
I am studying reinforcement learning with OpenAI Gym, but an error message appears after the drawing done with env.render() finishes.
In the code below the number of episodes is 200, so it should run as-is with copy and paste; once the 200 episodes are completed, the error message below is printed. I suspect the cause is that the window opened by render() is never closed, but I don't know how to deal with it.
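My guess is that the render window has to be closed explicitly with env.close() before the program ends, roughly like the minimal sketch below, but I have not confirmed that this is the right fix:

import gym

env = gym.make('Pendulum-v0')
env.reset()
for _ in range(100):
    env.render()
    env.step(env.action_space.sample())  # random action, just for this sketch
env.close()  # my assumption: closing the viewer here avoids the exception at interpreter shutdown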

Error message
Exception ignored in: <bound method Viewer.__del__ of <gym.envs.classic_control.rendering.Viewer object at 0x0000016230960D68>>
Traceback (most recent call last):
  File "C:\Users\xxx\Anaconda3\envs\OpenAIGym_pipenv\lib\site-packages\gym\envs\classic_control\rendering.py", line 152, in __del__
  File "C:\Users\xxx\Anaconda3\envs\OpenAIGym_pipenv\lib\site-packages\gym\envs\classic_control\rendering.py", line 71, in close
  File "C:\Users\xxx\Anaconda3\envs\OpenAIGym_pipenv\lib\site-packages\pyglet\window\win32\__init__.py", line 305, in close
  File "C:\Users\xxx\Anaconda3\envs\OpenAIGym_pipenv\lib\site-packages\pyglet\window\__init__.py", line 770, in close
ImportError: sys.meta_path is None, Python is likely shutting down
Source code
import gym
import matplotlib.pyplot as plt
import numpy as np
NUM_DIGITIZE = 8
NUM_ACTION = 16
ACTION = np.linspace(-2, 2, NUM_ACTION)
NUM_EPISODES = 200
MAX_STEP = 100
ETA = 0.5
GAMMA = 0.9
ENV = 'Pendulum-v0'

class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)
    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_Q_table(observation, action, reward, observation_next)
    def get_action(self, observation, step):
        action = self.brain.decide_action(observation, step)
        return action

class Brain:
    def __init__(self, num_states, num_actions):
        self.num_actions = NUM_ACTION
        self.q_table = np.random.uniform(0, 1, size=(NUM_DIGITIZE ** num_states, NUM_ACTION))
    def bins(self, clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]
    def digitize_state(self, observation):
        cos, sin, w = observation
        digitized = [
            np.digitize(cos, bins=self.bins(-1.0, 1.0, NUM_DIGITIZE)),
            np.digitize(sin, bins=self.bins(-1.0, 1.0, NUM_DIGITIZE)),
            np.digitize(w, bins=self.bins(-8.0, 8.0, NUM_DIGITIZE))]
        return sum([x * (NUM_DIGITIZE ** i) for i, x in enumerate(digitized)])
    def digitize_action(self, action):
        return np.digitize(action, bins=self.bins(-2, 2, NUM_ACTION))
    def update_Q_table(self, observation, action, reward, observation_next):
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        Max_Q_next = max(self.q_table[state_next][:])
        self.q_table[state, action] = self.q_table[state, action] + \
            ETA * (reward + GAMMA * Max_Q_next - self.q_table[state, action])
    def decide_action(self, observation, episode):
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (1 + episode))
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state][:])
        else:
            action = np.random.choice(self.num_actions)
        Action = ACTION[action]
        return (Action, action)

class Logger:
    def __init__(self):
        self.log = []
    def log_func(self, x):
        self.log.append(x)

class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        self.Log = Logger()
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.agent = Agent(num_states, num_actions)
    def run(self):
        for episode in range(NUM_EPISODES):
            print(f"{episode} episode start")
            XXX = self.Log
            print(XXX)
            observation = self.env.reset()
            for step in range(MAX_STEP):
                if episode % 100 == 0:
                    self.env.render()
                action_index = self.agent.get_action(observation, step)[1]
                Action = self.agent.get_action(observation, step)[0]
                # print(Action)
                observation_next, reward, done, _ = self.env.step([Action])
                self.Log.log_func(reward)
                if done:
                    reward = -10
                else:
                    reward = reward
                self.agent.update_Q_function(observation, action_index, reward, observation_next)
                observation = observation_next
                if done:
                    break

if __name__ == '__main__':
    cartpole_env = Environment()
    cartpole_env.run()
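If my guess about env.close() is correct, I imagine the entry point would need to change to something like the following (untested), so that the render window is also closed if the run is interrupted:

if __name__ == '__main__':
    cartpole_env = Environment()
    try:
        cartpole_env.run()
    finally:
        cartpole_env.env.close()  # assumption: close the pyglet viewer explicitly before Python shuts down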


Supplemental information (FW/tool version etc.)

PyCharm