I am using OpenAI Gym to build a reinforcement learning environment for a game of tag.
For now, only the demon (the hunter) can be moved.

The problem I am having

The rules of the game are already implemented, but when I train, the demon does not learn at all.
Even after 100,000 training steps it still does not work, and I cannot tell why.
I suspect the problem is the line return np.array(self.observation), hunter_reward, action, {}, but there may be other causes as well, so could you please take a look?
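For reference, a Gym-style step() is conventionally expected to return the four values (observation, reward, done, info), where done is a boolean episode-finished flag rather than the chosen action. A minimal, self-contained sketch of that shape (the names here are illustrative only, not taken from my environment):

import numpy as np

# Sketch of the return shape Gym conventionally expects from step().
# All names below are illustrative.
def step_example(position, goal, step_count, max_steps=10):
    observation = np.array([position, goal])                 # what the agent sees next
    reward = 1.0 if position == goal else 0.0                # scalar reward for this transition
    done = (position == goal) or (step_count >= max_steps)   # True when the episode ends
    info = {}                                                # free-form diagnostics dictionary
    return observation, reward, done, info

print(step_example(position=3, goal=3, step_count=4))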

Error message
Nothing in particular.

import gym.spaces
import numpy as np
import pandas
import math
import matplotlib.pyplot as plt
import time
import random
class Game(gym.core.Env):
    # Stores the initial conditions and the various variables.
    def __init__(self):
        self.hunter_Position_X = random.randint(0, 5)
        self.hunter_Position_Y = random.randint(0, 5)
        print("The initial position of the demon is " + str(self.hunter_Position_X), self.hunter_Position_Y)
        # Kept on self as instance variables. Randomly place the demon's x and y coordinates.
        self.fugitive_Position_X = random.randint(0, 5)
        self.fugitive_Position_Y = random.randint(0, 5)
        print("The initial position of the fugitive is " + str(self.fugitive_Position_X), self.fugitive_Position_Y)
        # Kept on self as instance variables. Randomly place the fugitive's x and y coordinates.
        while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
            self.hunter_Position_X = random.randint(0, 5)
            self.hunter_Position_Y = random.randint(0, 5)
        # print(self.hunter_Position_X, self.hunter_Position_Y)
        # If the fugitive and the demon start on exactly the same square, re-roll the demon's initial position.
        self.game_count = 0
        # Upper limit on the number of moves allowed per game. Here it is 10.
        self.initial_distance = int(100 * math.sqrt((self.hunter_Position_X - self.fugitive_Position_X) ** 2 + (self.hunter_Position_Y - self.fugitive_Position_Y) ** 2))
        print("Initial distance is " + str(self.initial_distance))
        # Distance between the demon and the fugitive (Pythagorean theorem), multiplied by 100 so it can be handled as a natural number.
        self.lists = []
        # List that stores the distances.
        self.current_hunter_profit_lists = []
        # Stores the demon's reward at each step.
        self.current_fugitive_profit_lists = []
        # Stores the fugitive's reward at each step.
        self.action_space = gym.spaces.Discrete(4)
        low = np.array([0, 0, 0, 0])
        high = np.array([5, 5, 5, 5])
        self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
        # Defines the playing area.
        self.hunter_reward = 0
        self.fugitive_reward = 0
        # The demon's and the fugitive's rewards are initialized to 0.
        self.learn_count = 0
        # Counter used to limit the number of learning steps to 10,000.
        self.lists.append(self.initial_distance)
        # Store the starting distance.
    def step(self, action):
        self.game_count += 1
        self.learn_count += 1
        print("learn count", self.learn_count)
        if action == 0 and self.hunter_Position_X < 5:
            self.hunter_Position_X += 1
        if action == 1 and self.hunter_Position_X > 0:
            self.hunter_Position_X -= 1
        if action == 2 and self.hunter_Position_Y < 5:
            self.hunter_Position_Y += 1
        if action == 3 and self.hunter_Position_Y > 0:
            self.hunter_Position_Y -= 1
        print("The position of the demon is " + str(self.hunter_Position_X), self.hunter_Position_Y)
        print("The position of the fugitive is " + str(self.fugitive_Position_X), self.fugitive_Position_Y)
        # The demon has 4 selectable actions: it can move up, down, left or right.
        if action == 0 and self.hunter_Position_X == 5:
            pass
        if action == 1 and self.hunter_Position_X == 0:
            pass
        if action == 2 and self.hunter_Position_Y == 5:
            pass
        if action == 3 and self.hunter_Position_Y == 0:
            pass
        # As exception handling, trying to move out of the area wastes the turn. & is not used because, unlike and, it does not behave as expected here.
        time.sleep(0.01)
        # Wait 0.01 seconds between steps.
        self.d = self.cal_distance(h_X=self.hunter_Position_X, h_Y=self.hunter_Position_Y, f_X=self.fugitive_Position_X, f_Y=self.fugitive_Position_Y)
        self.lists.append(self.d)
        # Store the distance.
        self.observation = (self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y)
        # Store the positions of the demon and the fugitive at every step.
        hunter_reward, fugitive_reward = self.calc_profit()
        # The rewards are computed in the calc_profit function below.
        print("Demon's reward " + str(hunter_reward), "Fugitive's reward " + str(fugitive_reward))
        print("Demon's total reward", sum(self.current_hunter_profit_lists), "Fugitive's total reward", sum(self.current_fugitive_profit_lists))
        is_end = self.reset()
        print("return value is", np.array(self.observation), hunter_reward, action)
        return np.array(self.observation), hunter_reward, action, {}
        # Four return values are required. When learning does not go well, these values probably need to change; they should contain whatever determines the behaviour.
        # if action == 4:
        #     self.fugitive_Position_X += 1
        # if action == 5:
        #     self.fugitive_Position_X -= 1
        # if action == 6:
        #     self.fugitive_Position_Y += 1
        # if action == 7:
        #     self.fugitive_Position_Y -= 1
    def reset_position(self):
        hunter_Position_X = random.randint(0, 5)
        hunter_Position_Y = random.randint(0, 5)
        fugitive_Position_X = random.randint(0, 5)
        fugitive_Position_Y = random.randint(0, 5)
        while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
            hunter_Position_X = random.randint(0, 5)
            hunter_Position_Y = random.randint(0, 5)
        print("Reset !!!")
        print()
        return hunter_Position_X, hunter_Position_Y, fugitive_Position_X, fugitive_Position_Y
        # Returns the new positions.
        # Called when the conditions for ending a game are met.
        # Places the demon and the fugitive randomly.
    def cal_distance(self, h_X, h_Y, f_X, f_Y):
        distance = int(100 * math.sqrt((h_X - f_X) ** 2 + (h_Y - f_Y) ** 2))
        return distance
    def calc_profit(self):
        i = self.game_count
        if i <= 10 and self.lists[i] == 0:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("Capture succeeded !!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward when the demon captures the fugitive within 10 moves. Also resets the distance list and the game count.
        elif i == 10 and (0 not in self.lists):
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("Capture failed !!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward when the demon fails to capture the fugitive within 10 moves. Also resets the distance list and the game count.
        elif i <= 10 and self.lists[i - 1] < self.lists[i]:
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("The fugitive is getting away !!!")
            # Reward defined by comparing the distance at the previous step with this step.
        elif i <= 10 and self.lists[i - 1] > self.lists[i]:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("Closing the distance !!!")
            # Reward defined by comparing the distance at the previous step with this step.
        elif i <= 10 and self.lists[i - 1] == self.lists[i]:
            self.hunter_reward += 0
            self.fugitive_reward += 0
            current_hunter_reward = 0
            current_fugitive_reward = 0
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("The distance hasn't changed !!!")
            # Reward defined by comparing the distance at the previous step with this step.
        else:
            pass
        return current_hunter_reward, current_fugitive_reward
        # def Linear_function(self):
        #     Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3) * self.hunter_Position_X
        #     Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3) * self.hunter_Position_X
        #     Y_intercept_3 = self.hunter_Position_Y - (1 / math.sqrt(3)) * self.hunter_Position_X
        #     Y_intercept_4 = self.hunter_Position_Y + (1 / math.sqrt(3)) * self.hunter_Position_X
        #     # Y = math.sqrt(3) * X + b
        # The program only does exactly what is written.
    def reset(self):
        if self.learn_count == 0:
            is_end = True
        else:
            is_end = False
        # Reset only when the allotted number of learning steps has run out. At that point the reward is reset.

Language

python

What I tried

I tried fixing the return value of step() by referring to the CartPole code in Gym.

Supplementary information (FW/tool version, etc.)
None

  • Answer #1

    env.py

    import gym.spaces
    import numpy as np
    import pandas
    import math
    import matplotlib.pyplot as plt
    import time
    import random
    class Game(gym.core.Env):
        # Stores the initial conditions and the various variables.
        def __init__(self):
            self.hunter_Position_X = random.randint(0, 4)
            self.hunter_Position_Y = random.randint(0, 4)
            # print("The initial position of the demon is " + str(self.hunter_Position_X), self.hunter_Position_Y)
            # Kept on self as instance variables. Randomly place the demon's x and y coordinates.
            self.fugitive_Position_X = random.randint(0, 4)
            self.fugitive_Position_Y = random.randint(0, 4)
            # print("The initial position of the fugitive is " + str(self.fugitive_Position_X), self.fugitive_Position_Y)
            # Kept on self as instance variables. Randomly place the fugitive's x and y coordinates.
            while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
                self.hunter_Position_X = random.randint(0, 4)
                self.hunter_Position_Y = random.randint(0, 4)
            # print(self.hunter_Position_X, self.hunter_Position_Y)
            # If the fugitive and the demon start on exactly the same square, re-roll the demon's initial position.
            self.game_count = 0
            # Upper limit on the number of moves allowed per game. Here it is 10.
            self.initial_distance = int(100 * math.sqrt((self.hunter_Position_X - self.fugitive_Position_X) ** 2 + (self.hunter_Position_Y - self.fugitive_Position_Y) ** 2))
            # print("Initial distance is " + str(self.initial_distance))
            # Distance between the demon and the fugitive (Pythagorean theorem), multiplied by 100 so it can be handled as a natural number.
            self.lists = []
            # List that stores the distances.
            self.current_hunter_profit_lists = []
            # Stores the demon's reward at each step.
            self.current_fugitive_profit_lists = []
            # Stores the fugitive's reward at each step.
            self.action_space = gym.spaces.Discrete(4)
            low = np.array([0, 0, 0, 0])
            high = np.array([4, 4, 4, 4])
            self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
            # Defines the playing area.
            self.hunter_reward = 0
            self.fugitive_reward = 0
            # The demon's and the fugitive's rewards are initialized to 0.
            self.learn_count = 0
            # Counter used to limit the number of learning steps to 10,000.
            self.lists.append(self.initial_distance)
            # Store the starting distance.
        def step(self, action):
            self.game_count += 1
            self.learn_count += 1
            # print("learn count", self.learn_count)
            if action == 0 and self.hunter_Position_X < 5:
                self.hunter_Position_X += 1
            if action == 1 and self.hunter_Position_X >= 0:
                self.hunter_Position_X -= 1
            if action == 2 and self.hunter_Position_Y < 5:
                self.hunter_Position_Y += 1
            if action == 3 and self.hunter_Position_Y >= 0:
                self.hunter_Position_Y -= 1
            # print("The position of the demon is " + str(self.hunter_Position_X), self.hunter_Position_Y)
            # print("The position of the fugitive is " + str(self.fugitive_Position_X), self.fugitive_Position_Y)
            # The demon has 4 selectable actions: it can move up, down, left or right.
            if action == 0 and self.hunter_Position_X == 5:
                pass
            if action == 1 and self.hunter_Position_X == -1:
                pass
            if action == 2 and self.hunter_Position_Y == 5:
                pass
            if action == 3 and self.hunter_Position_Y == -1:
                pass
            # As exception handling, trying to move out of the area wastes the turn. & is not used because, unlike and, it does not behave as expected here.
            # time.sleep(0.01)
            # Wait 0.01 seconds between steps.
            self.d = self.cal_distance(h_X=self.hunter_Position_X, h_Y=self.hunter_Position_Y, f_X=self.fugitive_Position_X, f_Y=self.fugitive_Position_Y)
            self.lists.append(self.d)
            # Store the distance.
            self.observation = (self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y)
            # Store the positions of the demon and the fugitive at every step.
            hunter_reward, fugitive_reward = self.calc_profit()
            # The rewards are computed in the calc_profit function below.
            # print("Demon's reward " + str(hunter_reward), "Fugitive's reward " + str(fugitive_reward))
            print("Demon's total reward", sum(self.current_hunter_profit_lists), "Fugitive's total reward", sum(self.current_fugitive_profit_lists))
            is_end = self.reset()
            # print("return value is", np.array(self.observation), hunter_reward, action)
            return np.array(self.observation), hunter_reward, action, {}
            # Four return values are required. When learning does not go well, these values probably need to change; they should contain whatever determines the behaviour.
            # if action == 4:
            #     self.fugitive_Position_X += 1
            # if action == 5:
            #     self.fugitive_Position_X -= 1
            # if action == 6:
            #     self.fugitive_Position_Y += 1
            # if action == 7:
            #     self.fugitive_Position_Y -= 1
        def reset_position(self):
            hunter_Position_X = random.randint(0, 4)
            hunter_Position_Y = random.randint(0, 4)
            fugitive_Position_X = random.randint(0, 4)
            fugitive_Position_Y = random.randint(0, 4)
            while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
                hunter_Position_X = random.randint(0, 4)
                hunter_Position_Y = random.randint(0, 4)
            print("Reset !!!")
            print()
            return hunter_Position_X, hunter_Position_Y, fugitive_Position_X, fugitive_Position_Y
            # Returns the new positions.
            # Called when the conditions for ending a game are met.
            # Places the demon and the fugitive randomly.
        def cal_distance(self, h_X, h_Y, f_X, f_Y):
            distance = int(100 * math.sqrt((h_X - f_X) ** 2 + (h_Y - f_Y) ** 2))
            return distance
        def calc_profit(self):
            i = self.game_count
            if i <= 10 and self.lists[i] == 0:
                self.hunter_reward += 1
                self.fugitive_reward -= 1
                current_hunter_reward = 1
                current_fugitive_reward = -1
                self.current_hunter_profit_lists.append(current_hunter_reward)
                self.current_fugitive_profit_lists.append(current_fugitive_reward)
                # print("Capture succeeded !!!")
                self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
                self.game_count = 0
                self.lists = []
                self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
                # Reward when the demon captures the fugitive within 10 moves. Also resets the distance list and the game count.
            elif i == 10 and (0 not in self.lists):
                self.hunter_reward -= 1
                self.fugitive_reward += 1
                current_hunter_reward = -1
                current_fugitive_reward = 1
                self.current_hunter_profit_lists.append(current_hunter_reward)
                self.current_fugitive_profit_lists.append(current_fugitive_reward)
                # print("Capture failed !!!")
                self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
                self.game_count = 0
                self.lists = []
                self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
                # Reward when the demon fails to capture the fugitive within 10 moves. Also resets the distance list and the game count.
            elif i <= 10 and self.lists[i - 1] < self.lists[i]:
                self.hunter_reward -= 1
                self.fugitive_reward += 1
                current_hunter_reward = -1
                current_fugitive_reward = 1
                self.current_hunter_profit_lists.append(current_hunter_reward)
                self.current_fugitive_profit_lists.append(current_fugitive_reward)
                # print("The fugitive is getting away !!!")
                # Reward defined by comparing the distance at the previous step with this step.
            elif i <= 10 and self.lists[i - 1] > self.lists[i]:
                self.hunter_reward += 1
                self.fugitive_reward -= 1
                current_hunter_reward = 1
                current_fugitive_reward = -1
                self.current_hunter_profit_lists.append(current_hunter_reward)
                self.current_fugitive_profit_lists.append(current_fugitive_reward)
                # print("Closing the distance !!!")
                # Reward defined by comparing the distance at the previous step with this step.
            elif i <= 10 and self.lists[i - 1] == self.lists[i]:
                self.hunter_reward += 0
                self.fugitive_reward += 0
                current_hunter_reward = 0
                current_fugitive_reward = 0
                self.current_hunter_profit_lists.append(current_hunter_reward)
                self.current_fugitive_profit_lists.append(current_fugitive_reward)
                # print("The distance hasn't changed !!!")
                # Reward defined by comparing the distance at the previous step with this step.
            else:
                pass
            return current_hunter_reward, current_fugitive_reward
            # def Linear_function(self):
            #     Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3) * self.hunter_Position_X
            #     Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3) * self.hunter_Position_X
            #     Y_intercept_3 = self.hunter_Position_Y - (1 / math.sqrt(3)) * self.hunter_Position_X
            #     Y_intercept_4 = self.hunter_Position_Y + (1 / math.sqrt(3)) * self.hunter_Position_X
            #     # Y = math.sqrt(3) * X + b
            # The program only does exactly what is written.
        def reset(self):
            if self.learn_count == 0:
                is_end = True
            else:
                is_end = False
            # Reset only when the allotted number of learning steps has run out. At that point the reward is reset.

  • Answer #2

    Postscript:
    Use runner.py from this answer together with env.py from the other answer.


    ■ Getting it to run
    As I wrote in the comment, Gym itself only provides the environment, so it seems you have to set up the Q-learning side yourself.

    Save the code below under a suitable name; I think it should go directly under "gym-master".
    After roughly 500 episodes the epsilon and learning-rate decays bottom out, and from that point on the demon starts chasing properly.
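    (For reference, gym.make("myenv-v1") only resolves if the custom environment has been registered with Gym beforehand. A minimal sketch of that registration, under the assumption that the Game class from the other answer is saved as env.py on the import path; runner.py itself does not do this.)

    from gym.envs.registration import register

    # Register the tag environment so that gym.make("myenv-v1") can find it.
    # "env:Game" assumes a file env.py that defines the Game class.
    register(
        id="myenv-v1",
        entry_point="env:Game",
    )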

    ■ Technical notes
    bins_size, low_bound and high_bound are very important; if these are set properly, I think the code can be reused quite widely.

    variable      explanation
    bins_size     number of elements = number of observation parameters; each element's value = number of bins (options) for that parameter
    low_bound     minimum value each parameter can take
    high_bound    maximum value each parameter can take

    These "values a parameter can take" apparently need to be discrete, and in practice you would have to do various tedious things yourself, but Takahiro Kubo's code absorbs all of that. It is easier to handle when everything is in one place, so I copied and pasted it into a single file.
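    (The heart of that discretization is np.digitize: each observation component is mapped to a bin index, and the indices are packed into a single integer that serves as the Q-table key. A standalone sketch of the idea, using np.linspace for the bin edges rather than the np.arange construction in the code below:)

    import numpy as np

    bins_size = [5, 5, 5, 5]    # number of bins for each observation component
    low_bound = [0, 0, 0, 0]    # lower limit of each component
    high_bound = [4, 4, 4, 4]   # upper limit of each component

    # One array of bin edges per observation component.
    dimension_bins = [np.linspace(lo, hi, n - 1)
                      for lo, hi, n in zip(low_bound, high_bound, bins_size)]

    def observation_to_state(observation):
        # Pack the per-component bin indices into one integer, base max(bins_size).
        state = 0
        unit = max(bins_size)
        for d, o in enumerate(np.asarray(observation).flatten()):
            state += np.digitize(o, dimension_bins[d]) * unit ** d
        return state

    print(observation_to_state((3, 0, 2, 4)))  # a single Q-table key for this observation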

    ■ Supplement
    It may run more smoothly without the time.sleep.

    ■ Acknowledgments
    Takahiro Kubo: Thank you for publishing such wonderful code. I am very grateful.
    shi.hi: This gave me the chance to get started with reinforcement learning. Thank you very much.

    runner.py

    # Original code was provided on https://github.com/icoxfog417/techcircle_openai_handson
    # and was provided under the MIT license by Takahiro Kubo.
    # This was modified from "handson3.py".
    import os
    import sys
    import math
    import argparse
    import gym
    RECORD_PATH = os.path.join(os.path.dirname(__file__), "./upload")
    from collections import defaultdict
    import numpy as np
    ####
    class COMMON():
        # target_env = "myenv-v1"  # "CartPole-v0"
        # target_env = "CartPole-v0"
        target_env = "myenv-v1"
        if target_env == "CartPole-v0":
            bins_size = [3, 3, 8, 5]  # number of bins per observation parameter
            low_bound = [None, -0.5, None, -math.radians(50)]  # lower limit for each parameter
            high_bound = [None, 0.5, None, math.radians(50)]  # upper limit for each parameter
        else:
            bins_size = [5, 5, 5, 5]
            low_bound = [0, 0, 0, 0]  # lower limit for each parameter
            high_bound = [4, 4, 4, 4]  # upper limit for each parameter
    ####
    # Copied from "q.py"
    class Q():
        def __init__(self, n_actions, observation_space, bin_size, low_bound=None, high_bound=None, initial_mean=0.0, initial_std=0.0):
            self.n_actions = n_actions
            self._observation_dimension = 1
            for d in observation_space.shape:
                self._observation_dimension *= d
            self._bin_sizes = bin_size if isinstance(bin_size, list) else [bin_size] * self._observation_dimension
            self._dimension_bins = []
            for i, low, high in self._low_high_iter(observation_space, low_bound, high_bound):
                b_size = self._bin_sizes[i]
                bins = self._make_bins(low, high, b_size)
                self._dimension_bins.append(bins)
            # If we encounter a new observation, we initialize its action evaluations.
            self.table = defaultdict(lambda: initial_std * np.random.randn(self.n_actions) + initial_mean)
        @classmethod
        def _make_bins(cls, low, high, bin_size):
            bins = np.arange(low, high, (float(high) - float(low)) / (bin_size - 2))  # exclude both ends
            if min(bins) < 0 and 0 not in bins:
                bins = np.sort(np.append(bins, [0]))  # 0-centric bins
            return bins
        @classmethod
        def _low_high_iter(cls, observation_space, low_bound, high_bound):
            lows = observation_space.low
            highs = observation_space.high
            for i in range(len(lows)):
                low = lows[i]
                if low_bound is not None:
                    _low_bound = low_bound if not isinstance(low_bound, list) else low_bound[i]
                    low = low if _low_bound is None else max(low, _low_bound)
                high = highs[i]
                if high_bound is not None:
                    _high_bound = high_bound if not isinstance(high_bound, list) else high_bound[i]
                    high = high if _high_bound is None else min(high, _high_bound)
                yield i, low, high
        def observation_to_state(self, observation, target_env):
            if target_env == "CartPole-v0":
                state = 0
                # Caution: a bin_size over 10 will not work accurately.
                unit = max(self._bin_sizes)
                for d, o in enumerate(observation.flatten()):
                    state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size identifier system
            else:
                state = 0
                unit = max(self._bin_sizes)
                if observation is None:
                    pass
                else:
                    for d, o in enumerate(np.asarray(observation).flatten()):
                        state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size identifier system
            return state
        def values(self, observation, target_env):
            state = self.observation_to_state(observation, target_env)
            return self.table[state]
    ####
    # Copied from "agent.py"
    import random
    import numpy as np
    class Agent():
        def __init__(self, q, epsilon=0.05):
            self.q = q
            self.epsilon = epsilon
        def act(self, observation, target_env):
            # Epsilon-greedy action selection.
            action = -1
            if np.random.random() < self.epsilon:
                action = np.random.choice(self.q.n_actions)
            else:
                action = np.argmax(self.q.values(observation, target_env))
            return action
    ####
    # Copied from "trainer.py"
    from collections import deque
    class Trainer():
        def __init__(self, agent, target_env, gamma=0.95, learning_rate=0.1, learning_rate_decay=None, epsilon=0.05, epsilon_decay=None, max_step=-1):
            self.agent = agent
            self.target_env = target_env
            self.gamma = gamma
            self.learning_rate = learning_rate
            self.learning_rate_decay = learning_rate_decay
            self.epsilon = epsilon
            self.epsilon_decay = epsilon_decay
            self.max_step = max_step
        def train(self, env, episode_count, render=False):
            default_epsilon = self.agent.epsilon
            self.agent.epsilon = self.epsilon
            values = []
            steps = deque(maxlen=100)
            lr = self.learning_rate
            for i in range(episode_count):
                obs = env.reset()
                step = 0
                done = False
                while not done:
                    if render:
                        if self.target_env == "myenv-v1":
                            print("Not supported yet.")
                        else:
                            env.render()
                    action = self.agent.act(obs, self.target_env)
                    next_obs, reward, done, _ = env.step(action)
                    state = self.agent.q.observation_to_state(obs, self.target_env)
                    future = 0 if done else np.max(self.agent.q.values(next_obs, self.target_env))
                    value = self.agent.q.table[state][action]
                    self.agent.q.table[state][action] += lr * (reward + self.gamma * future - value)
                    obs = next_obs
                    values.append(value)
                    step += 1
                    if self.max_step > 0 and step > self.max_step:
                        done = True
                else:
                    mean = np.mean(values)
                    steps.append(step)
                    mean_step = np.mean(steps)
                    print("Episode {}: {} steps (avg {}). epsilon={:.3f}, lr={:.3f}, mean q value={:.2f}".format(
                        i, step, mean_step, self.agent.epsilon, lr, mean))
                    if self.epsilon_decay is not None:
                        self.agent.epsilon = self.epsilon_decay(self.agent.epsilon, i)
                    if self.learning_rate_decay is not None:
                        lr = self.learning_rate_decay(lr, i)
    def main(episodes, render):
        env = gym.make(COMMON.target_env)
        q = Q(
            env.action_space.n,
            env.observation_space,
            bin_size=COMMON.bins_size,
            low_bound=COMMON.low_bound,
            high_bound=COMMON.high_bound
            )
        agent = Agent(q, epsilon=0.05)
        learning_decay = lambda lr, t: max(0.1, min(0.5, 1.0 - math.log10((t + 1) / 25)))
        epsilon_decay = lambda eps, t: max(0.01, min(1.0, 1.0 - math.log10((t + 1) / 25)))
        trainer = Trainer(
            agent,
            target_env=COMMON.target_env,
            gamma=0.99,
            learning_rate=0.5, learning_rate_decay=learning_decay,
            epsilon=1.0, epsilon_decay=epsilon_decay,
            max_step=250)
        trainer.train(env, episode_count=episodes, render=render)
    if __name__ == "__main__":
        parser = argparse.ArgumentParser(description="train & run cartpole")
        parser.add_argument("--episode", type=int, default=1000, help="episode to train")
        parser.add_argument("--render", action="store_true", help="render the screen")
        args = parser.parse_args()
        main(args.episode, args.render)