Answer edit history

Edit 2: Addendum (test)

**Addendum:**

Please use runner.py from this answer together with envs.py from the other answer.
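
For `gym.make("myenv-v1")` in runner.py below to resolve, the custom environment has to be registered with gym somewhere. If your setup does not already do that (for example in a package `__init__.py`), a minimal sketch, assuming the other answer's file is saved as envs.py next to runner.py, would be:

```Python3
# Hypothetical glue code, not part of either file: register the Game class
# from envs.py under the id that runner.py passes to gym.make().
import gym
from gym.envs.registration import register

register(
    id="myenv-v1",            # the id used by runner.py (COMMON.target_env)
    entry_point="envs:Game",  # module "envs" (envs.py), class "Game"
)

env = gym.make("myenv-v1")    # now resolves to envs.Game
```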

---

**■ Getting it to run**

As I also wrote in the comments, the gym side can only provide the environment, so it seems you have to prepare the Q-learning part yourself.

…

This "number of selectable actions" apparently has to be a discrete value, and normally that would involve a fair amount of tedious work, but Takahiro Kubo's code absorbs it for you. Since the code is easier to handle as a single file, I copy-pasted everything into one.
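
Concretely, what gets absorbed is the discretization: the Q class in runner.py below maps each observation to a single integer state by binning every component with np.digitize and packing the bin indices positionally (the "bin_size numeral system" in its comments). A rough standalone illustration with made-up bins, not part of the answer's files:

```Python3
# Illustration only: discretize a 4-component observation into one integer
# state index, the same way Q.observation_to_state() below does.
import numpy as np

bins = [np.array([1, 2, 3, 4])] * 4   # assumed: 4 components, same bin edges each
unit = 5                              # bins per component

def observation_to_state(observation):
    state = 0
    for d, o in enumerate(np.asarray(observation).flatten()):
        state += np.digitize(o, bins[d]) * unit ** d  # positional packing
    return state

print(observation_to_state((0, 3, 5, 2)))  # a single integer, usable as a Q-table key
```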

…

runner.py

```Python3
# Original code was provided on https://github.com/icoxfog417/techcircle_openai_handson
# that provided as MIT license by Takahiro Kubo.
# This was modified from "handson3.py".

import os
import sys
import math
import argparse
import gym

RECORD_PATH = os.path.join(os.path.dirname(__file__), "./upload")

from collections import defaultdict
import numpy as np


####
class COMMON():
    # target_env = "myenv-v1" # "CartPole-v0"
    # target_env = "CartPole-v0"
    target_env = "myenv-v1"

    if target_env == "CartPole-v0":
        bins_size = [3, 3, 8, 5]    # number of splitted parameters
        low_bound = [None, -0.5, None, -math.radians(50)]   # Limit of minimum value for each parameter
        high_bound = [None, 0.5, None, math.radians(50)]    # Limit of maximum value for each parameter
    else:
        bins_size = [5, 5, 5, 5]
        low_bound = [0, 0, 0, 0]    # Limit of minimum value for each parameter
        high_bound = [4, 4, 4, 4]   # Limit of maximum value for each parameter


####
# Copied from "q.py"
class Q():

    def __init__(self, n_actions, observation_space, bin_size, low_bound=None, high_bound=None, initial_mean=0.0, initial_std=0.0):
        self.n_actions = n_actions
        self._observation_dimension = 1
        for d in observation_space.shape:
            self._observation_dimension *= d

        self._bin_sizes = bin_size if isinstance(bin_size, list) else [bin_size] * self._observation_dimension
        self._dimension_bins = []
        for i, low, high in self._low_high_iter(observation_space, low_bound, high_bound):
            b_size = self._bin_sizes[i]
            bins = self._make_bins(low, high, b_size)
            self._dimension_bins.append(bins)

        # if we encounter the new observation, we initialize action evaluations
        self.table = defaultdict(lambda: initial_std * np.random.randn(self.n_actions) + initial_mean)

    @classmethod
    def _make_bins(cls, low, high, bin_size):
        bins = np.arange(low, high, (float(high) - float(low)) / (bin_size - 2))  # exclude both ends
        if min(bins) < 0 and 0 not in bins:
            bins = np.sort(np.append(bins, [0]))  # 0 centric bins
        return bins

    @classmethod
    def _low_high_iter(cls, observation_space, low_bound, high_bound):
        lows = observation_space.low
        highs = observation_space.high
        for i in range(len(lows)):
            low = lows[i]
            if low_bound is not None:
                _low_bound = low_bound if not isinstance(low_bound, list) else low_bound[i]
                low = low if _low_bound is None else max(low, _low_bound)

            high = highs[i]
            if high_bound is not None:
                _high_bound = high_bound if not isinstance(high_bound, list) else high_bound[i]
                high = high if _high_bound is None else min(high, _high_bound)

            yield i, low, high

    def observation_to_state(self, observation, target_env):
        if target_env == "CartPole-v0":
            state = 0
            # caution: bin_size over 10 will not work accurately
            unit = max(self._bin_sizes)
            for d, o in enumerate(observation.flatten()):
                state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
        else:
            state = 0
            unit = max(self._bin_sizes)
            if observation is None:
                pass
            else:
                for d, o in enumerate(np.asarray(observation).flatten()):
                    state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
        return state

    def values(self, observation, target_env):
        state = self.observation_to_state(observation, target_env)
        return self.table[state]


####
# Copied from "agent.py"
import random
import numpy as np

class Agent():

    def __init__(self, q, epsilon=0.05):
        self.q = q
        self.epsilon = epsilon

    def act(self, observation, target_env):
        # your code here
        action = -1
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.q.n_actions)
        else:
            action = np.argmax(self.q.values(observation, target_env))
        return action


####
# Copied from "trainer.py"
from collections import deque


class Trainer():

    def __init__(self, agent, target_env, gamma=0.95, learning_rate=0.1, learning_rate_decay=None, epsilon=0.05, epsilon_decay=None, max_step=-1):
        self.agent = agent
        self.target_env = target_env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.max_step = max_step

    def train(self, env, episode_count, render=False):
        default_epsilon = self.agent.epsilon
        self.agent.epsilon = self.epsilon
        values = []
        steps = deque(maxlen=100)
        lr = self.learning_rate
        for i in range(episode_count):
            obs = env.reset()
            step = 0
            done = False
            while not done:
                if render:
                    if self.target_env == "myenv-v1":
                        print("Not supported yet.")
                    else:
                        env.render()

                action = self.agent.act(obs, self.target_env)
                next_obs, reward, done, _ = env.step(action)

                state = self.agent.q.observation_to_state(obs, self.target_env)
                future = 0 if done else np.max(self.agent.q.values(next_obs, self.target_env))
                value = self.agent.q.table[state][action]
                self.agent.q.table[state][action] += lr * (reward + self.gamma * future - value)

                obs = next_obs
                values.append(value)
                step += 1
                if self.max_step > 0 and step > self.max_step:
                    done = True
            else:
                mean = np.mean(values)
                steps.append(step)
                mean_step = np.mean(steps)
                print("Episode {}: {}steps(avg{}). epsilon={:.3f}, lr={:.3f}, mean q value={:.2f}".format(
                    i, step, mean_step, self.agent.epsilon, lr, mean)
                )

            if self.epsilon_decay is not None:
                self.agent.epsilon = self.epsilon_decay(self.agent.epsilon, i)
            if self.learning_rate_decay is not None:
                lr = self.learning_rate_decay(lr, i)


def main(episodes, render):
    env = gym.make(COMMON.target_env)

    q = Q(
        env.action_space.n,
        env.observation_space,
        bin_size=COMMON.bins_size,
        low_bound=COMMON.low_bound,
        high_bound=COMMON.high_bound
    )
    agent = Agent(q, epsilon=0.05)

    learning_decay = lambda lr, t: max(0.1, min(0.5, 1.0 - math.log10((t + 1) / 25)))
    epsilon_decay = lambda eps, t: max(0.01, min(1.0, 1.0 - math.log10((t + 1) / 25)))
    trainer = Trainer(
        agent,
        target_env=COMMON.target_env,
        gamma=0.99,
        learning_rate=0.5, learning_rate_decay=learning_decay,
        epsilon=1.0, epsilon_decay=epsilon_decay,
        max_step=250)

    trainer.train(env, episode_count=episodes, render=render)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train & run cartpole ")
    parser.add_argument("--episode", type=int, default=1000, help="episode to train")
    parser.add_argument("--render", action="store_true", help="render the screen")

    args = parser.parse_args()

    main(args.episode, args.render)
```
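
Assuming both files are in place and "myenv-v1" is registered, training would be started with something like `python runner.py --episode 1000` (the argparse defaults above); note that `--render` only prints "Not supported yet." when the target is myenv-v1.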

Edit 1: Attached the actual code (test)

envs.py

```Python3
import gym.spaces
import numpy as np
import pandas
import math
import matplotlib.pyplot as plt
import time
import random


class Game(gym.core.Env):
    # Initial conditions and the various variables are set up here.
    def __init__(self):
        self.hunter_Position_X = random.randint(0, 5)
        self.hunter_Position_Y = random.randint(0, 5)
        # print("鬼の初期位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        # Kept on self so they are accessible everywhere. The hunter's x, y coordinates are placed randomly.
        self.fugitive_Position_X = random.randint(0, 5)
        self.fugitive_Position_Y = random.randint(0, 5)
        # print("逃亡者の初期位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
        # Kept on self as well. The fugitive's x, y coordinates are placed randomly.
        while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
            self.hunter_Position_X = random.randint(0, 5)
            self.hunter_Position_Y = random.randint(0, 5)
            # print(self.hunter_Position_X,self.hunter_Position_Y)
        # If the fugitive and the hunter start on exactly the same square, redraw the hunter's initial position.
        self.game_count = 0
        # Upper limit on the number of moves per game; here it is 10.
        self.initial_distance = int(100 * math.sqrt((self.hunter_Position_X - self.fugitive_Position_X) ** 2 + (self.hunter_Position_Y - self.fugitive_Position_Y) ** 2))
        # print("初期の距離は"+str(self.initial_distance))
        # Distance between hunter and fugitive: just the Pythagorean theorem, multiplied by 100 to handle it as a natural number.
        self.lists = []
        # List that stores the distances.
        self.current_hunter_profit_lists = []
        # The hunter's reward is appended every step.
        self.current_fugitive_profit_lists = []
        # The fugitive's reward is appended every step.

        self.action_space = gym.spaces.Discrete(4)
        low = np.array([0, 0, 0, 0])
        high = np.array([5, 5, 5, 5])
        self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
        # Defines the escape area.
        self.hunter_reward = 0
        self.fugitive_reward = 0
        # Initializes the hunter's and the fugitive's rewards to 0.
        self.learn_count = 0
        # The number of learning steps is limited to 10000.
        self.lists.append(self.initial_distance)
        # Stores the distance at the start.

    def step(self, action):
        self.game_count += 1
        self.learn_count += 1
        # print("学習回数は",self.learn_count)
        if action == 0 and self.hunter_Position_X < 5:
            self.hunter_Position_X += 1
        if action == 1 and self.hunter_Position_X > 0:
            self.hunter_Position_X -= 1
        if action == 2 and self.hunter_Position_Y < 5:
            self.hunter_Position_Y += 1
        if action == 3 and self.hunter_Position_Y > 0:
            self.hunter_Position_Y -= 1
        # print("鬼の位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        # print("逃亡者の位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)

        # The hunter has four selectable actions: move up, down, left or right.

        if action == 0 and self.hunter_Position_X == 5:
            pass
        if action == 1 and self.hunter_Position_X == 0:
            pass
        if action == 2 and self.hunter_Position_Y == 5:
            pass
        if action == 3 and self.hunter_Position_Y == 0:
            pass
        # As an exception, trying to step outside the area simply wastes one turn. Writing "and" as "&" behaved unexpectedly, so it is not used.
        # time.sleep(0.01)
        # Wait 0.01 seconds between steps.
        self.d = self.cal_distance(h_X=self.hunter_Position_X, h_Y=self.hunter_Position_Y, f_X=self.fugitive_Position_X, f_Y=self.fugitive_Position_Y)
        self.lists.append(self.d)
        # Store the distance.
        self.observation = (self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y)
        # Store the hunter's and the fugitive's positions every step.

        hunter_reward, fugitive_reward = self.calc_profit()
        # Rewards are computed in calc_profit(); see that method.
        # print("鬼の報酬は"+str(hunter_reward),"逃亡者の報酬は"+str(fugitive_reward))
        print("鬼の総合報酬は", sum(self.current_hunter_profit_lists), "逃亡者の総合報酬は", sum(self.current_fugitive_profit_lists))

        is_end = self.reset()

        # print("return値は",np.array(self.observation),hunter_reward,action)
        return np.array(self.observation), hunter_reward, action, {}
        # Four return values are required. If learning does not go well, the values above need to be changed; include whatever determines the action.

        # if action == 4:
        #     self.fugitive_Position_X += 1
        # if action == 5:
        #     self.fugitive_Position_X -= 1
        # if action == 6:
        #     self.fugitive_Position_Y += 1
        # if action == 7:
        #     self.fugitive_Position_Y -= 1

    def reset_position(self):
        hunter_Position_X = random.randint(0, 5)
        hunter_Position_Y = random.randint(0, 5)
        fugitive_Position_X = random.randint(0, 5)
        fugitive_Position_Y = random.randint(0, 5)
        while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
            hunter_Position_X = random.randint(0, 5)
            hunter_Position_Y = random.randint(0, 5)
        print("リセットされました!!!")
        print()
        return hunter_Position_X, hunter_Position_Y, fugitive_Position_X, fugitive_Position_Y
        # Keep the return values.
        # Describes what to do when the end condition of a game is met.
        # Places the hunter and the fugitive at random.

    def cal_distance(self, h_X, h_Y, f_X, f_Y):
        distance = int(100 * math.sqrt((h_X - f_X) ** 2 + (h_Y - f_Y) ** 2))
        return distance

    def calc_profit(self):
        i = self.game_count
        if i <= 10 and self.lists[i] == 0:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("確保成功!!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward for when the hunter catches the fugitive within 10 moves. Also resets the distance list and the game counter.

        elif i == 10 and (0 not in self.lists):
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("確保失敗!!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward for when the hunter fails to catch the fugitive within 10 moves. Also resets the distance list and the game counter.

        elif i <= 10 and self.lists[i-1] < self.lists[i]:
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("逃げられてるよ!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        elif i <= 10 and self.lists[i-1] > self.lists[i]:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("距離を詰めてるね!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        elif i <= 10 and self.lists[i-1] == self.lists[i]:
            self.hunter_reward += 0
            self.fugitive_reward += 0
            current_hunter_reward = 0
            current_fugitive_reward = 0
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("距離が変わってないよ!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        else:
            pass

        return current_hunter_reward, current_fugitive_reward

        # def Linear_function:
        #     Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3)*self.hunter_Position_X
        #     Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3)*self.hunter_Position_X
        #     Y_intercept_3 = self.hunter_Position_Y - (1/math.sqrt(3))*self.hunter_Position_X
        #     Y_intercept_4 = self.hunter_Position_Y + (1/math.sqrt(3))*self.hunter_Position_X
        #     Y = math.sqrt(3)X + b

        # A program only does exactly what you write.

    def reset(self):
        if self.learn_count == 0:
            is_end = True
        else:
            is_end = False
        # The only reset condition is that the learning count has been used up; the rewards are reset at that point.
```
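
As a quick sanity check, not part of the original answer, the environment can also be exercised on its own (this assumes the file above is saved as envs.py and its imports such as pandas and matplotlib are available):

```Python3
# Hypothetical smoke test for envs.py: instantiate Game directly and step it
# with random actions. Note that step() returns the chosen action in the
# position where gym conventionally returns "done".
from envs import Game

env = Game()
for _ in range(3):
    obs, reward, done, info = env.step(env.action_space.sample())
    print(obs, reward, done, info)
```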

The sample training output below was part of the previous revision and was removed by this edit:

```
鬼の総合報酬は -5 逃亡者の総合報酬は 5
Episode 295: 2steps(avg1.3). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 440
鬼の総合報酬は -4 逃亡者の総合報酬は 4
学習回数は 441
鬼の総合報酬は -3 逃亡者の総合報酬は 3
学習回数は 442
リセットされました!!!

鬼の総合報酬は -2 逃亡者の総合報酬は 2
学習回数は 443
鬼の総合報酬は -1 逃亡者の総合報酬は 1
学習回数は 444
鬼の総合報酬は 0 逃亡者の総合報酬は 0
Episode 296: 5steps(avg1.34). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 445
鬼の総合報酬は 1 逃亡者の総合報酬は -1
学習回数は 446
鬼の総合報酬は 2 逃亡者の総合報酬は -2
Episode 297: 2steps(avg1.35). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 447
鬼の総合報酬は 1 逃亡者の総合報酬は -1
学習回数は 448
鬼の総合報酬は 2 逃亡者の総合報酬は -2
Episode 298: 2steps(avg1.36). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 449
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 450
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 451
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 452
リセットされました!!!
```