open ai gymで鬼ごっこの強化学習が上手くいきません。


open ai gymを使用して鬼ごっこの強化学習を作成しています。
まず鬼のみ動かせる状態としています。

``` 発生している問題

ルールは作成済ですが、いざ学習してみると鬼の学習が全然できていない状態です。
10万回学習しても何故か上手く行かないです。
被疑個所としてreturn np.array(self.observation),hunter_reward,action,{}だと考えていますが、他の要因があるか分からないのでご教示いただけないでしょうか。

特になし

``` 該当のソースコード

import gym.spaces
import numpy as np
import pandas
import math
import matplotlib.pyplot as plt
import time
import random

class Game(gym.core.Env):
#初期条件や各種変数の初期格納する。
    def __init__(self):
        self.hunter_Position_X=random.randint(0,5)
        self.hunter_Position_Y=random.randint(0,5)
        print("鬼の初期位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        #selfでグローバル変数化している。鬼のx,y座標をランダムに配置。
        self.fugitive_Position_X=random.randint(0,5)
        self.fugitive_Position_Y=random.randint(0,5)
        print("逃亡者の初期位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
        #selfでグローバル変数化している。逃亡者のx,y座標をランダムに配置。fugitiveは逃亡者という意味。
        while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
            self.hunter_Position_X=random.randint(0,5)
            self.hunter_Position_Y=random.randint(0,5)
        #print(self.hunter_Position_X,self.hunter_Position_Y)
        #逃亡者と鬼の位置が完全に一致している場合、鬼の初期位置を再度決める。 
        self.game_count=0
        #1ゲームで行動できる上限を設定をしている。今回は10回とする。
        self.initial_distance=int(100*math.sqrt((self.hunter_Position_X-self.fugitive_Position_X)**2+(self.hunter_Position_Y-self.fugitive_Position_Y)**2))
        print("初期の距離は"+str(self.initial_distance))
        #鬼と逃亡者の距離を定義する。ただの三平方の定理。自然数で処理するため100倍した。
        self.lists = []
        #距離を格納するリスト。
        self.current_hunter_profit_lists = []
        #鬼の報酬を各ステップごとに加える。
        self.current_fugitive_profit_lists = []
        #逃亡者の報酬を各ステップごとに加える。

        self.action_space = gym.spaces.Discrete(4)
        low = np.array([0, 0, 0, 0])
        high = np.array([5, 5, 5, 5])
        self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
        #逃走エリアを定義している。
        self.hunter_reward=0
        self.fugitive_reward=0
        #鬼と逃亡者の報酬を0で初期化している。
        self.learn_count=0
        #学習回数を10000回と制限。
        self.lists.append(self.initial_distance)
        #開始時の距離を格納する。
    
    def step(self,action):
        self.game_count += 1
        self.learn_count += 1
        print("学習回数は",self.learn_count)
        if action == 0 and self.hunter_Position_X < 5:
            self.hunter_Position_X += 1
        if action == 1 and self.hunter_Position_X > 0:
            self.hunter_Position_X -= 1
        if action == 2 and self.hunter_Position_Y < 5:
            self.hunter_Position_Y += 1
        if action == 3 and self.hunter_Position_Y > 0:
            self.hunter_Position_Y -= 1
        print("鬼の位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        print("逃亡者の位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
        
        #鬼の行動を4つ設け選択できるようにする。上下左右に移動できる。
        
        if action == 0 and self.hunter_Position_X == 5:
            pass
        if action == 1 and self.hunter_Position_X == 0:
            pass
        if action == 2 and self.hunter_Position_Y == 5:
            pass
        if action == 3 and self.hunter_Position_Y == 0:
            pass
        #例外処理としてエリア外に出る行為は1ターン無駄に消費する事とする。andは&と書くと想定外の動作となった為使用禁止。
        time.sleep(0.01)
        #間隔を0.01秒とする。
        self.d = self.cal_distance(h_X = self.hunter_Position_X , h_Y = self.hunter_Position_Y , f_X = self.fugitive_Position_X , f_Y = self.fugitive_Position_Y)
        self.lists.append(self.d)
        #距離を格納
        self.observation = (self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y)
        #鬼と逃亡者の位置を毎回格納する。

        hunter_reward,fugitive_reward=self.calc_profit()
        #報酬はcalc_profitcalc_profit関数で計算するのでそちらを参照。
        print("鬼の報酬は"+str(hunter_reward),"逃亡者の報酬は"+str(fugitive_reward))
        print("鬼の総合報酬は",sum(self.current_hunter_profit_lists),"逃亡者の総合報酬は",sum(self.current_fugitive_profit_lists))

        is_end = self.reset()

        print("return値は",np.array(self.observation),hunter_reward,action)
        return np.array(self.observation),hunter_reward,action,{}
        #値は4つ必要。学習が良くない時は上記の変数値を変える必要あり。行動を決める要素を入れる。

        #if action == 4:
            #self.fugitive_Position_X += 1
        #if action == 5:
            #self.fugitive_Position_X -= 1
        #if action == 6:
            #self.fugitive_Position_Y += 1
        #if action == 7:
            #self.fugitive_Position_Y -= 1
            
    def reset_position(self):
        hunter_Position_X=random.randint(0,5)
        hunter_Position_Y=random.randint(0,5)
        fugitive_Position_X=random.randint(0,5) 
        fugitive_Position_Y=random.randint(0,5)                   
        while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
            hunter_Position_X=random.randint(0,5)
            hunter_Position_Y=random.randint(0,5)
        print("リセットされました！！！")
        print()
        return hunter_Position_X,hunter_Position_Y,fugitive_Position_X,fugitive_Position_Y
        #返り値を残しておく。
        #1ゲームの終了条件を満たしたときに行う指示を記載。
        #鬼、逃亡者をランダムに配置する。

    def cal_distance(self , h_X , h_Y ,f_X ,f_Y):
        distance = int(100*math.sqrt((h_X-f_X)**2 +(h_Y-f_Y)**2))
        return distance

    def calc_profit(self):
        i= self.game_count
        if i <= 10 and self.lists[i] == 0:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("確保成功！！！")
            self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y))
            #10回の行動以下で鬼が確保できた時の報酬を定義している。また距離のリストやゲームカウントを初期化している。

        elif i == 10 and (0 not in self.lists):
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("確保失敗！！！")
            self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y))
            #10回の行動以下で鬼が確保出来なかった時の報酬を定義している。また距離のリストやゲームカウントを初期化している。

        elif i <= 10 and self.lists[i-1] < self.lists[i]:
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("逃げられてるよ！！！")
            #前回ステップと今回のステップで距離を比較して報酬を定義している。

        elif i <= 10 and self.lists[i-1] > self.lists[i]:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("距離を詰めてるね！！！")
            #前回ステップと今回のステップで距離を比較して報酬を定義している。

        elif i <= 10 and self.lists[i-1] == self.lists[i]:
            self.hunter_reward += 0
            self.fugitive_reward += 0
            current_hunter_reward = 0
            current_fugitive_reward = 0
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            print("距離が変わってないよ！！！")
            #前回ステップと今回のステップで距離を比較して報酬を定義している。

        else:
            pass

        return current_hunter_reward,current_fugitive_reward

        #def Linear_function:
            #Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3)*self.hunter_Position_X
            #Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3)*self.hunter_Position_X
            #Y_intercept_3 = self.hunter_Position_Y - (1/math.sqrt(3))*self.hunter_Position_X
            #Y_intercept_4 = self.hunter_Position_Y + (1/math.sqrt(3))*self.hunter_Position_X
            #Y = math.sqrt(3)X + b
        
        #プログラミングは書いた通りにしか動かない。       
        
    def reset(self):
        if self.learn_count == 0:
            is_end = True
        else:
            is_end = False
            #リセットする条件は学習回数を満了した時のみ。その際に報酬をリセットする。

```ここに言語名を入力

python

gymにあるcartpoleのコードを参考にしてstep内のreturnを復元してみました。

なし

退会済みユーザー

2020/11/15 09:33

大変興味深い内容で私も興味あります。ですが、teratailの質問(回答)投稿時に、 ```ここに言語名を入力 ``` のところに、 ```Python3 import numpy as np ``` のようにしないとインデントが崩れてしまいます。ですので、取り急ぎこの点のみ修整ください。

退会済みユーザー

2020/11/16 11:47

これ単品ではコードが走らせられませんが、どのように走らせるか教えていただけますか？例：○○関数をif __name__ == "__main__":の後に入れる

shi.hi

2020/11/16 13:09

情報が不足していて申し訳ございません。既に載せたコード以外に下記の別ファイルを作成しています。下記のファイルを実行する事で質問文に記載したコードを走らせています。 import gym #env_dict = gym.envs.registration.registry.env_specs.copy() #for env in env_dict: # if 'myenv-v1' in env: # print("Remove {} from registry".format(env)) # del gym.envs.registration.registry.env_specs[env] from tqdm import tqdm import matplotlib.pyplot as plt env = gym.make('myenv-v1') observation = env.reset() for t in tqdm(range(1000000), leave=False): #env.render() observation, reward, done, info = env.step(env.action_space.sample()) #if reward >= 0: #print("STEP_"+str(t)+" Reward:"+str(reward)) #plt.imshow(observation) plt.show() env.close()

shi.hi

2020/11/16 13:21 編集

#の行は現在使用していないので、整理すると下記となります。 import gym from tqdm import tqdm import matplotlib.pyplot as plt env = gym.make('myenv-v1') observation = env.reset() for t in tqdm(range(1000000), leave=False): 　observation, reward, done, info = env.step(env.action_space.sample()) 　plt.show() env.close()

shi.hi

2020/11/16 13:17

最初の質問文に載せたコード文にコメントを追加しました。何の目的があって記載しているかの説明を簡易的にいたしました。

退会済みユーザー

2020/11/18 09:11

可能な限り手元で再現したいのですが、うまく走らない状況です。フォルダ構成やファイル構成についても差し支えなければ教えていただけますか？

shi.hi

2020/11/18 22:24

最初に載せたコード文の格納先は下記となります。またファイル名はenv.pyとなります。 C:\Users\user\AppData\Local\Programs\Python\Python37\Lib\site-packages\gym\envs\myenv 2020/11/16 22:11に追記したコードのファイルの格納先は任意となります。追加したファイルの位置は以上となります。そこ以外はファイルの位置は変更していません。

退会済みユーザー

2020/11/20 13:39

gym/envs/__init__.pyに register( id='myenv-v1', ... )のように書くと思うのですが（あっていますか？）もしそうであればこれの中身を教えていただけますか？

shi.hi

2020/11/20 15:01

御認識の通りです。__init__.pyに register(id='myenv-v1', entry_point='gym.envs.myenv.env:Game',) を追記しました。

退会済みユーザー

2020/11/21 04:48

まだ何にも進んでいませんが、おそらくGymの学習（ランダムに値を振って重みの学習）をするのが今のコードで、うまく動かすにはこの先の推論(predict)がいりそうな気がします。進展があったら書き込みます。

shi.hi

2020/11/21 07:50

学習が失敗する原因として ①まだコードが不足してるから。(predictが足りていない) ②observation, reward, done, info = env.step(env.action_space.sample())でsample()と記載しているから学習せずに適当になっているから。の2つが考えられると認識しています。 fourteenlength様は前者の可能性が高い認識でしょうか？

退会済みユーザー

2020/11/21 08:26

恐らくこれの両方と思います。env.action_space.sample()は「環境によらず適当な乱数を入れる関数」で、predictは「適当な乱数のうち、環境にうまく作用した乱数（のうちのどれか）を返す関数」だろうと踏んでいるためです。ーー強化学習をやりたいな（でもまだやるにはなぁ）と思っている中でのshi.hiさんの質問でした。私も勉強中の身なので、これだ！という解ではないことをご理解ください。（強化学習は普通の深層学習より触っていて楽しいですね）

shi.hi

2020/11/21 12:05

承知いたしました。自分の方でもpredict関数の実装をしてみます。

退会済みユーザー

2020/11/21 12:52

こちらの方のgithubに実際に動くサンプルがありました。 MITライセンスですので、大人の事情的にも使いやすいと思います。 https://github.com/icoxfog417/techcircle_openai_handson/tree/answer/handson_3 gymでは「環境」を提供するものであって、ユーザー側で環境を乗り倒すことのできる学習関数（Q）を実装しないといけないね、ということみたいですね。gymのドキュメントがもっとあると良いのですが、先人の皆様のブログ等なしにはなしえないですね

退会済みユーザー

2020/11/22 05:16

ちょっと魔改造しましたが動くようになりましたので近いうち（今日の夜には）にUploadしますね。

行動規範の内容に同意します

回答2件

ベストアンサー

env.py

Python3
1import gym.spaces
2import numpy as np
3import pandas
4import math
5import matplotlib.pyplot as plt
6import time
7import random
8
9class Game(gym.core.Env):
10#初期条件や各種変数の初期格納する。
11    def __init__(self):
12        self.hunter_Position_X=random.randint(0,4)
13        self.hunter_Position_Y=random.randint(0,4)
14        # print("鬼の初期位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
15        #selfでグローバル変数化している。鬼のx,y座標をランダムに配置。
16        self.fugitive_Position_X=random.randint(0,4)
17        self.fugitive_Position_Y=random.randint(0,4)
18        # print("逃亡者の初期位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
19        #selfでグローバル変数化している。逃亡者のx,y座標をランダムに配置。fugitiveは逃亡者という意味。
20        while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
21            self.hunter_Position_X=random.randint(0,4)
22            self.hunter_Position_Y=random.randint(0,4)
23        #print(self.hunter_Position_X,self.hunter_Position_Y)
24        #逃亡者と鬼の位置が完全に一致している場合、鬼の初期位置を再度決める。 
25        self.game_count=0
26        #1ゲームで行動できる上限を設定をしている。今回は10回とする。
27        self.initial_distance=int(100*math.sqrt((self.hunter_Position_X-self.fugitive_Position_X)**2+(self.hunter_Position_Y-self.fugitive_Position_Y)**2))
28        # print("初期の距離は"+str(self.initial_distance))
29        #鬼と逃亡者の距離を定義する。ただの三平方の定理。自然数で処理するため100倍した。
30        self.lists = []
31        #距離を格納するリスト。
32        self.current_hunter_profit_lists = []
33        #鬼の報酬を各ステップごとに加える。
34        self.current_fugitive_profit_lists = []
35        #逃亡者の報酬を各ステップごとに加える。
36
37        self.action_space = gym.spaces.Discrete(4)
38        low = np.array([0, 0, 0, 0])
39        high = np.array([4, 4, 4, 4])
40        self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
41        #逃走エリアを定義している。
42        self.hunter_reward=0
43        self.fugitive_reward=0
44        #鬼と逃亡者の報酬を0で初期化している。
45        self.learn_count=0
46        #学習回数を10000回と制限。
47        self.lists.append(self.initial_distance)
48        #開始時の距離を格納する。
49
50    def step(self,action):
51        self.game_count += 1
52        self.learn_count += 1
53        # print("学習回数は",self.learn_count)
54
55        if action == 0 and self.hunter_Position_X < 5:
56            self.hunter_Position_X += 1
57        if action == 1 and self.hunter_Position_X >= 0:
58            self.hunter_Position_X -= 1
59        if action == 2 and self.hunter_Position_Y < 5:
60            self.hunter_Position_Y += 1
61        if action == 3 and self.hunter_Position_Y >= 0:
62            self.hunter_Position_Y -= 1
63        # print("鬼の位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
64        # print("逃亡者の位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
65
66        #鬼の行動を4つ設け選択できるようにする。上下左右に移動できる。
67
68        if action == 0 and self.hunter_Position_X == 5:
69            pass
70        if action == 1 and self.hunter_Position_X == -1:
71            pass
72        if action == 2 and self.hunter_Position_Y == 5:
73            pass
74        if action == 3 and self.hunter_Position_Y == -1:
75            pass
76        #例外処理としてエリア外に出る行為は1ターン無駄に消費する事とする。andは&と書くと想定外の動作となった為使用禁止。
77        # time.sleep(0.01)
78        #間隔を0.01秒とする。
79        self.d = self.cal_distance(h_X = self.hunter_Position_X , h_Y = self.hunter_Position_Y , f_X = self.fugitive_Position_X , f_Y = self.fugitive_Position_Y)
80        self.lists.append(self.d)
81        #距離を格納
82        self.observation = (self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y)
83        #鬼と逃亡者の位置を毎回格納する。
84
85        hunter_reward,fugitive_reward=self.calc_profit()
86        #報酬はcalc_profitcalc_profit関数で計算するのでそちらを参照。
87        # print("鬼の報酬は"+str(hunter_reward),"逃亡者の報酬は"+str(fugitive_reward))
88        print("鬼の総合報酬は",sum(self.current_hunter_profit_lists),"逃亡者の総合報酬は",sum(self.current_fugitive_profit_lists))
89
90        is_end = self.reset()
91
92        # print("return値は",np.array(self.observation),hunter_reward,action)
93        return np.array(self.observation),hunter_reward,action,{}
94        #値は4つ必要。学習が良くない時は上記の変数値を変える必要あり。行動を決める要素を入れる。
95
96        #if action == 4:
97            #self.fugitive_Position_X += 1
98        #if action == 5:
99            #self.fugitive_Position_X -= 1
100        #if action == 6:
101            #self.fugitive_Position_Y += 1
102        #if action == 7:
103            #self.fugitive_Position_Y -= 1
104
105    def reset_position(self):
106        hunter_Position_X=random.randint(0,4)
107        hunter_Position_Y=random.randint(0,4)
108        fugitive_Position_X=random.randint(0,4) 
109        fugitive_Position_Y=random.randint(0,4)                   
110        while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
111            hunter_Position_X=random.randint(0,4)
112            hunter_Position_Y=random.randint(0,4)
113        print("リセットされました！！！")
114        print()
115        return hunter_Position_X,hunter_Position_Y,fugitive_Position_X,fugitive_Position_Y
116        #返り値を残しておく。
117        #1ゲームの終了条件を満たしたときに行う指示を記載。
118        #鬼、逃亡者をランダムに配置する。
119
120    def cal_distance(self , h_X , h_Y ,f_X ,f_Y):
121        distance = int(100*math.sqrt((h_X-f_X)**2 +(h_Y-f_Y)**2))
122        return distance
123
124    def calc_profit(self):
125        i= self.game_count
126        if i <= 10 and self.lists[i] == 0:
127            self.hunter_reward += 1
128            self.fugitive_reward -= 1
129            current_hunter_reward = 1
130            current_fugitive_reward = -1
131            self.current_hunter_profit_lists.append(current_hunter_reward)
132            self.current_fugitive_profit_lists.append(current_fugitive_reward)
133            # print("確保成功！！！")
134            self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position()
135            self.game_count = 0
136            self.lists = []
137            self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y))
138            #10回の行動以下で鬼が確保できた時の報酬を定義している。また距離のリストやゲームカウントを初期化している。
139
140        elif i == 10 and (0 not in self.lists):
141            self.hunter_reward -= 1
142            self.fugitive_reward += 1
143            current_hunter_reward = -1
144            current_fugitive_reward = 1
145            self.current_hunter_profit_lists.append(current_hunter_reward)
146            self.current_fugitive_profit_lists.append(current_fugitive_reward)
147            # print("確保失敗！！！")
148            self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position()
149            self.game_count = 0
150            self.lists = []
151            self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y))
152            #10回の行動以下で鬼が確保出来なかった時の報酬を定義している。また距離のリストやゲームカウントを初期化している。
153
154        elif i <= 10 and self.lists[i-1] < self.lists[i]:
155            self.hunter_reward -= 1
156            self.fugitive_reward += 1
157            current_hunter_reward = -1
158            current_fugitive_reward = 1
159            self.current_hunter_profit_lists.append(current_hunter_reward)
160            self.current_fugitive_profit_lists.append(current_fugitive_reward)
161            # print("逃げられてるよ！！！")
162            #前回ステップと今回のステップで距離を比較して報酬を定義している。
163
164        elif i <= 10 and self.lists[i-1] > self.lists[i]:
165            self.hunter_reward += 1
166            self.fugitive_reward -= 1
167            current_hunter_reward = 1
168            current_fugitive_reward = -1
169            self.current_hunter_profit_lists.append(current_hunter_reward)
170            self.current_fugitive_profit_lists.append(current_fugitive_reward)
171            # print("距離を詰めてるね！！！")
172            #前回ステップと今回のステップで距離を比較して報酬を定義している。
173
174        elif i <= 10 and self.lists[i-1] == self.lists[i]:
175            self.hunter_reward += 0
176            self.fugitive_reward += 0
177            current_hunter_reward = 0
178            current_fugitive_reward = 0
179            self.current_hunter_profit_lists.append(current_hunter_reward)
180            self.current_fugitive_profit_lists.append(current_fugitive_reward)
181            # print("距離が変わってないよ！！！")
182            #前回ステップと今回のステップで距離を比較して報酬を定義している。
183
184        else:
185            pass
186
187        return current_hunter_reward,current_fugitive_reward
188
189        #def Linear_function:
190            #Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3)*self.hunter_Position_X
191            #Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3)*self.hunter_Position_X
192            #Y_intercept_3 = self.hunter_Position_Y - (1/math.sqrt(3))*self.hunter_Position_X
193            #Y_intercept_4 = self.hunter_Position_Y + (1/math.sqrt(3))*self.hunter_Position_X
194            #Y = math.sqrt(3)X + b
195
196        #プログラミングは書いた通りにしか動かない。       
197
198    def reset(self):
199        if self.learn_count == 0:
200            is_end = True
201        else:
202            is_end = False
203            #リセットする条件は学習回数を満了した時のみ。その際に報酬をリセットする。
204

投稿2020/11/22 18:39

編集2020/11/23 09:33

退会済みユーザー

総合スコア0

追記:
この解答欄のrunner.pyと別の回答のenvs.pyを使ってください。

■動かすまで
コメントにも書きましたが、gym側は環境を提供するしかできないため、自前でQ学習をする環境を用意してあげないといけないようです。

以下のコードに適当な名前を付けて保存してください。保存先は「gym-master」直下が良いと思います。
500回くらい学習するとepsilonとlearning rateが底をつきます。その頃合いから鬼がちゃんと追いかけるようになります。

■技術的注意事項
bins_size、low_bound、high_boundが非常に重要で、恐らくこれさえちゃんとできていれば相当使いまわしができるのではないかと思います。

変数	説明
bins_size	要素数=行動の個数、要素の値=行動の取りえる選択肢数
low_bound	行動の取りえる選択肢の最小値
high_bound	行動の取りえる選択肢の最大値

この「行動の取りえる選択肢数」は離散値である必要があるそうで、本当はいろいろめんどくさいことをやらないといけませんが、Takahiro Kuboさんのコードがそれを吸収してくれます。コードをひとまとめにした方が扱いやすいため、コピペで１つにまとめました。

■補足
time.sleepはない方がスムーズでいいかもしれません。

■謝辞
Takahiro Kuboさん：素晴らしいコードを公開いただいた漢気にこの場を使って感謝いたします。ありがとうございます。
shi.hiさん：強化学習に手を付けるきっかけを頂けました。ありがとうございます。

runner.py

Python3
1# Original code was provided on https://github.com/icoxfog417/techcircle_openai_handson
2# that provided as MIT license by Takahiro Kubo.
3# This was modified from "handson3.py".
4
5import os
6import sys
7
8import math
9import argparse
10import gym
11
12
13RECORD_PATH = os.path.join(os.path.dirname(__file__), "./upload")
14from collections import defaultdict
15import numpy as np
16
17####
18class COMMON():
19    # target_env = "myenv-v1" # "CartPole-v0"
20    # target_env = "CartPole-v0"
21    target_env = "myenv-v1"
22
23    if target_env == "" "CartPole-v0":
24        bins_size = [3, 3, 8, 5] # number of splitted parameters
25        low_bound= [None, -0.5, None, -math.radians(50)] # Limit of minimum value for each parameter
26        high_bound= [None, 0.5, None, math.radians(50)] # Limit of maximum value for each parameter
27
28    else:
29        bins_size= [5, 5, 5, 5] 
30        low_bound= [0, 0, 0, 0] # Limit of minimum value for each parameter
31        high_bound= [4, 4, 4, 4] # Limit of maximum value for each parameter
32####
33# Copied from "q.py"
34class Q():
35
36    def __init__(self, n_actions, observation_space, bin_size, low_bound=None, high_bound=None, initial_mean=0.0, initial_std=0.0):
37        self.n_actions = n_actions
38        self._observation_dimension = 1
39        for d in observation_space.shape:
40            self._observation_dimension *= d
41
42        self._bin_sizes = bin_size if isinstance(bin_size, list) else [bin_size] * self._observation_dimension
43        self._dimension_bins = []
44        for i, low, high in self._low_high_iter(observation_space, low_bound, high_bound):
45            b_size = self._bin_sizes[i]
46            bins = self._make_bins(low, high, b_size)
47            self._dimension_bins.append(bins)
48
49        # if we encounter the new observation, we initialize action evaluations
50        self.table = defaultdict(lambda: initial_std * np.random.randn(self.n_actions) + initial_mean)
51
52    @classmethod
53    def _make_bins(cls, low, high, bin_size):
54        bins = np.arange(low, high, (float(high) - float(low)) / (bin_size - 2))  # exclude both ends
55        if min(bins) < 0 and 0 not in bins:
56            bins = np.sort(np.append(bins, [0]))  # 0 centric bins
57        return bins
58
59    @classmethod
60    def _low_high_iter(cls, observation_space, low_bound, high_bound):
61        lows = observation_space.low
62        highs = observation_space.high
63        for i in range(len(lows)):
64            low = lows[i]
65            if low_bound is not None:
66                _low_bound = low_bound if not isinstance(low_bound, list) else low_bound[i]
67                low = low if _low_bound is None else max(low, _low_bound)
68
69            high = highs[i]
70            if high_bound is not None:
71                _high_bound = high_bound if not isinstance(high_bound, list) else high_bound[i]
72                high = high if _high_bound is None else min(high, _high_bound)
73
74            yield i, low, high
75
76    def observation_to_state(self, observation,target_env):
77        if  target_env == "CartPole-v0":
78            state = 0
79            # caution: bin_size over 10 will not work accurately
80            unit = max(self._bin_sizes)
81
82            for d, o in enumerate(observation.flatten()):
83                state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
84        else:
85
86            state = 0
87            unit = max(self._bin_sizes)
88
89            if observation is  None:
90                pass
91            else:
92                for d, o in enumerate(np.asarray(observation).flatten()):
93                    state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
94        return state
95
96    def values(self, observation,target_env):
97        state = self.observation_to_state(observation,target_env)
98        return self.table[state]
99
100####
101# Copied from "agent.py"
102import random
103import numpy as np
104class Agent():
105
106    def __init__(self, q, epsilon=0.05):
107        self.q = q
108        self.epsilon = epsilon
109
110    def act(self, observation,target_env):
111        # your code here
112        action = -1
113        if np.random.random() < self.epsilon:
114            action = np.random.choice(self.q.n_actions)
115        else:
116            action = np.argmax(self.q.values(observation,target_env))
117
118        return action
119
120####
121# Copied from "trainer.py"
122from collections import deque
123
124
125class Trainer():
126
127    def __init__(self, agent,target_env, gamma=0.95, learning_rate=0.1, learning_rate_decay=None, epsilon=0.05, epsilon_decay=None, max_step=-1):
128        self.agent = agent
129        self.target_env = target_env
130        self.gamma = gamma
131        self.learning_rate = learning_rate
132        self.learning_rate_decay = learning_rate_decay
133        self.epsilon = epsilon
134        self.epsilon_decay = epsilon_decay
135        self.max_step = max_step
136
137    def train(self, env, episode_count, render=False):
138        default_epsilon = self.agent.epsilon
139        self.agent.epsilon = self.epsilon
140        values = []
141        steps = deque(maxlen=100)
142        lr = self.learning_rate
143        for i in range(episode_count):
144            obs = env.reset()
145            step = 0
146            done = False
147            while not done:
148                if render:
149                    if self.target_env ==  "myenv-v1":
150                        print("Not supported yet.")
151                    else:
152                        env.render()
153
154                action = self.agent.act(obs,self.target_env)
155                next_obs, reward, done, _ = env.step(action)
156
157                state = self.agent.q.observation_to_state(obs,self.target_env)
158                future = 0 if done else np.max(self.agent.q.values(next_obs,self.target_env))
159                value = self.agent.q.table[state][action]
160                self.agent.q.table[state][action] += lr * (reward + self.gamma * future - value)
161
162                obs = next_obs
163                values.append(value)
164                step += 1
165                if self.max_step > 0 and step > self.max_step:
166                    done = True
167            else:
168                mean = np.mean(values)
169                steps.append(step)
170                mean_step = np.mean(steps)
171                print("Episode {}: {}steps(avg{}). epsilon={:.3f}, lr={:.3f}, mean q value={:.2f}".format(
172                    i, step, mean_step, self.agent.epsilon, lr, mean)
173                    )
174                
175                if self.epsilon_decay is not None:
176                    self.agent.epsilon = self.epsilon_decay(self.agent.epsilon, i)
177                if self.learning_rate_decay is not None:
178                    lr = self.learning_rate_decay(lr, i)
179
180
181
182
183def main(episodes, render):
184    env = gym.make(COMMON.target_env) 
185
186    q = Q(
187        env.action_space.n, 
188        env.observation_space, 
189        bin_size= COMMON.bins_size,
190        low_bound= COMMON.low_bound,
191        high_bound= COMMON.high_bound
192        )
193    agent = Agent(q, epsilon=0.05)
194
195    learning_decay = lambda lr, t: max(0.1, min(0.5, 1.0 - math.log10((t + 1) / 25)))
196    epsilon_decay = lambda eps, t: max(0.01, min(1.0, 1.0 - math.log10((t + 1) / 25)))
197    trainer = Trainer(
198        agent,
199        target_env = COMMON.target_env,
200        gamma=0.99,
201        learning_rate=0.5, learning_rate_decay=learning_decay, 
202        epsilon=1.0, epsilon_decay=epsilon_decay,
203        max_step=250)
204
205
206    trainer.train(env, episode_count=episodes, render=render)
207
208
209
210if __name__ == "__main__":
211    parser = argparse.ArgumentParser(description="train & run cartpole ")
212    parser.add_argument("--episode", type=int, default=1000, help="episode to train")
213    parser.add_argument("--render", action="store_true", help="render the screen")
214
215    args = parser.parse_args()
216
217    main(args.episode, args.render)
218

投稿2020/11/22 08:37

編集2020/11/22 18:42

退会済みユーザー

総合スコア0

shi.hi

2020/11/22 13:09

御回答いただきありがとうございます。内容を確認中ですが、不明点があったらご質問させていただく事があると思います。

退会済みユーザー

2020/11/22 13:10

重くて編集できませんでしたが「from trainer import Trainer」の行はなくても動く（あるとエラーが出るので）削除してください。

shi.hi

2020/11/22 13:18

承知しました。今記載していただいたコードを保存しました(格納場所はどこだか回答から特定出来ていなかったので適当です) runさせるとfrom trainer import Trainerの行でERRが発生したので、回答にない何かしらのオペレーションをしたのかな？と推測しているところでした（笑）

shi.hi

2020/11/22 13:20

コメントアウトすると動作する事を確認いたしました。

shi.hi

2020/11/22 13:55

ちなみにですが、env.py(ルールや報酬を定義したファイル)に何かしら加筆修正はしたでしょうか？

退会済みユーザー

2020/11/22 18:13 編集

時間を短くしたくてsleepをコメントアウトしたり、最終的な評価値だけ見たくてprint()はコメントアウトしたと思いますが他はそのままだったと思います。投稿欄に投稿できるようであれば今動く全文をアップロードします。

shi.hi

2020/11/23 00:03

少しルールを加えてみました。 import gym.spaces import numpy as np import pandas import math import matplotlib.pyplot as plt import time import random class Game(gym.core.Env): #初期条件や各種変数の初期格納する。 def __init__(self): self.hunter_Position_X=random.randint(0,5) self.hunter_Position_Y=random.randint(0,5) print("鬼の初期位置は"+str(self.hunter_Position_X),self.hunter_Position_Y) #selfでグローバル変数化している。鬼のx,y座標をランダムに配置。 self.fugitive_Position_X=random.randint(0,5) self.fugitive_Position_Y=random.randint(0,5) print("逃亡者の初期位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y) #selfでグローバル変数化している。逃亡者のx,y座標をランダムに配置。fugitiveは逃亡者という意味。 while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y: self.hunter_Position_X=random.randint(0,5) self.hunter_Position_Y=random.randint(0,5) #print(self.hunter_Position_X,self.hunter_Position_Y) #逃亡者と鬼の位置が完全に一致している場合、鬼の初期位置を再度決める。 self.game_count=0 #1ゲームで行動できる上限を設定をしている。今回は10回とする。 self.initial_distance=int(100*math.sqrt((self.hunter_Position_X-self.fugitive_Position_X)**2+(self.hunter_Position_Y-self.fugitive_Position_Y)**2)) print("初期の距離は"+str(self.initial_distance)) #鬼と逃亡者の距離を定義する。ただの三平方の定理。自然数で処理するため100倍した。 self.lists = [] #距離を格納するリスト。 self.current_hunter_profit_lists = [] #鬼の報酬を各ステップごとに加える。 self.current_fugitive_profit_lists = [] #逃亡者の報酬を各ステップごとに加える。 self.action_space = gym.spaces.Discrete(4) low = np.array([0, 0, 0, 0]) high = np.array([5, 5, 5, 5]) self.observation_space = gym.spaces.Box(low, high, dtype=np.int64) #逃走エリアを定義している。 self.hunter_reward=0 self.fugitive_reward=0 #鬼と逃亡者の報酬を0で初期化している。 self.learn_count=0 #学習回数を10000回と制限。 self.lists.append(self.initial_distance) #開始時の距離を格納する。 def step(self,action): self.game_count += 1 self.learn_count += 1 print("学習回数は",self.learn_count) if action == 0 and self.hunter_Position_X < 5: self.hunter_Position_X += 1 if action == 1 and self.hunter_Position_X > 0: self.hunter_Position_X -= 1 if action == 2 and self.hunter_Position_Y < 5: self.hunter_Position_Y += 1 if action == 3 and self.hunter_Position_Y > 0: self.hunter_Position_Y -= 1 print("鬼の位置は"+str(self.hunter_Position_X),self.hunter_Position_Y) print("逃亡者の位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y) #鬼の行動を4つ設け選択できるようにする。上下左右に移動できる。 if action == 0 and self.hunter_Position_X == 5: action = 4 if action == 1 and self.hunter_Position_X == 0: action = 4 if action == 2 and self.hunter_Position_Y == 5: action = 4 if action == 3 and self.hunter_Position_Y == 0: action = 4 self.action_number = [] self.action_number = action print("今回のself.action_numberは"+str(self.action_number)) #例外処理としてエリア外に出る行為は1ターン無駄に消費する事とする。andは&と書くと想定外の動作となった為使用禁止。 time.sleep(0.1) #間隔を0.01秒とする。 self.d = self.cal_distance(h_X = self.hunter_Position_X , h_Y = self.hunter_Position_Y , f_X = self.fugitive_Position_X , f_Y = self.fugitive_Position_Y) self.lists.append(self.d) #距離を格納 self.observation = (self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y) #鬼と逃亡者の位置を毎回格納する。 hunter_reward,fugitive_reward=self.calc_profit() #報酬はcalc_profitcalc_profit関数で計算するのでそちらを参照。 print("鬼の報酬は"+str(hunter_reward),"逃亡者の報酬は"+str(fugitive_reward)) print("鬼の総合報酬は",sum(self.current_hunter_profit_lists),"逃亡者の総合報酬は",sum(self.current_fugitive_profit_lists)) is_end = self.reset() print("return値は",np.array(self.observation),hunter_reward,action) print() return np.array(self.observation),hunter_reward,action,{} #値は4つ必要。学習が良くない時は上記の変数値を変える必要あり。行動を決める要素を入れる。 #if action == 4: #self.fugitive_Position_X += 1 #if action == 5: #self.fugitive_Position_X -= 1 #if action == 6: #self.fugitive_Position_Y += 1 #if action == 7: #self.fugitive_Position_Y -= 1 def reset_position(self): hunter_Position_X=random.randint(0,5) hunter_Position_Y=random.randint(0,5) fugitive_Position_X=random.randint(0,5) fugitive_Position_Y=random.randint(0,5) while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y: hunter_Position_X=random.randint(0,5) hunter_Position_Y=random.randint(0,5) print("リセットされました！！！") print() return hunter_Position_X,hunter_Position_Y,fugitive_Position_X,fugitive_Position_Y #返り値を残しておく。 #1ゲームの終了条件を満たしたときに行う指示を記載。 #鬼、逃亡者をランダムに配置する。 def cal_distance(self , h_X , h_Y ,f_X ,f_Y): distance = int(100*math.sqrt((h_X-f_X)**2 +(h_Y-f_Y)**2)) return distance def calc_profit(self): i= self.game_count if i <= 10 and self.lists[i] == 0: self.hunter_reward += 1 self.fugitive_reward -= 1 current_hunter_reward = 1 current_fugitive_reward = -1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("確保成功！！！") self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position() self.game_count = 0 self.lists = [] self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y)) #10回の行動以下で鬼が確保できた時の報酬を定義している。また距離のリストやゲームカウントを初期化している。 elif i == 10 and (0 not in self.lists): self.hunter_reward -= 1 self.fugitive_reward += 1 current_hunter_reward = -1 current_fugitive_reward = 1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("確保失敗！！！") self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y = self.reset_position() self.game_count = 0 self.lists = [] self.lists.append(self.cal_distance(self.hunter_Position_X,self.hunter_Position_Y,self.fugitive_Position_X,self.fugitive_Position_Y)) #10回の行動以下で鬼が確保出来なかった時の報酬を定義している。また距離のリストやゲームカウントを初期化している。 elif i <= 10 and self.lists[i-1] < self.lists[i]: self.hunter_reward -= 1 self.fugitive_reward += 1 current_hunter_reward = -1 current_fugitive_reward = 1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("逃げられてるよ！！！") #前回ステップと今回のステップで距離を比較して報酬を定義している。上記は距離が縮まっている場合。 elif i <= 10 and self.lists[i-1] > self.lists[i]: self.hunter_reward += 1 self.fugitive_reward -= 1 current_hunter_reward = 1 current_fugitive_reward = -1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("距離を詰めてるね！！！") #前回ステップと今回のステップで距離を比較して報酬を定義している。上記は距離が縮まっていない場合。 elif self.action_number == 4: self.hunter_reward -= 0.1 self.fugitive_reward += 0.1 current_hunter_reward = -0.1 current_fugitive_reward = 0.1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("行動が間違っているよ！！！") elif i <= 10 and self.lists[i-1] == self.lists[i]: self.hunter_reward += 0 self.fugitive_reward += 0 current_hunter_reward = 0 current_fugitive_reward = 0 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("距離が変わってないよ！！！") #前回ステップと今回のステップで距離を比較して報酬を定義している。上記は距離が変わらない場合。 return current_hunter_reward,current_fugitive_reward #def Linear_function: #Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3)*self.hunter_Position_X #Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3)*self.hunter_Position_X #Y_intercept_3 = self.hunter_Position_Y - (1/math.sqrt(3))*self.hunter_Position_X #Y_intercept_4 = self.hunter_Position_Y + (1/math.sqrt(3))*self.hunter_Position_X #Y = math.sqrt(3)X + b #プログラミングは書いた通りにしか動かない。 def reset(self): if self.learn_count == 0: is_end = True else: is_end = False #リセットする条件は学習回数を満了した時のみ。その際に報酬をリセットする。

shi.hi

2020/11/23 00:05

変更箇所は if action == 0 and self.hunter_Position_X == 5: action = 4 if action == 1 and self.hunter_Position_X == 0: action = 4 if action == 2 and self.hunter_Position_Y == 5: action = 4 if action == 3 and self.hunter_Position_Y == 0: action = 4 self.action_number = [] self.action_number = action print("今回のself.action_numberは"+str(self.action_number)) と意味ない行動した時のaction番号を4と定義しました。また報酬も下記の追記をしました。 elif self.action_number == 4: self.hunter_reward -= 0.1 self.fugitive_reward += 0.1 current_hunter_reward = -0.1 current_fugitive_reward = 0.1 self.current_hunter_profit_lists.append(current_hunter_reward) self.current_fugitive_profit_lists.append(current_fugitive_reward) print("行動が間違っているよ！！！")

shi.hi

2020/11/23 00:08

この条件の場合 bins_size= [5, 5, 5, 5] low_bound= [0, 0, 0, 0] # Limit of minimum value for each parameter high_bound= [4, 4, 4, 4] # Limit of maximum value for each parameter はどう変わるでしょうか・・・・・・？

退会済みユーザー

2020/11/23 03:26 編集

＠env.pyの位置指定関係初期位置でrandom.randint(0,5)とありますが、これだとマスの位置が0,1,2,3,4,5の6種類になるので、これはrandom.randint(0,4)が正解と思います。同様に、観測地の最大値が収まるhigh = np.array([5, 5, 5, 5])は[4, 4, 4, 4]が正解と思います。＠env.pyのstep if action == 0 and self.hunter_Position_X < 5: if action == 1 and self.hunter_Position_X >= 0: if action == 2 and self.hunter_Position_Y < 5: if action == 3 and self.hunter_Position_Y >= 0: の範囲内にあるときにはハンターの位置を移動が正解と思います。 5以下については以下のような指定であれば、大きい側にはみ出すことはできますが、 0より大きい、の条件の場合、小さい側に意図的に起こしたい「無駄な移動」が発生できないためです。 if action == 1 and self.hunter_Position_X > 0: #0より大きい時には移動後に0になりえる（0の場合にはこの条件を満たせないので、移動後の-1：無駄な移動が起きない）これを解消するには if action == 1 and self.hunter_Position_X >= 0: としないといけない。＠Q学習のコード bins_size=[7, 7, 7, 7]かもしれません。値の範囲については-1,0,1,2,3,4,5の7種類を取りえるためです。要素の数は、ハンターのXYと逃亡者のXYで４つになります。 low_bound= [-1, -1, -1, -1]と思います。上記の-1が要素の取りえる最小値のためです。同様に、 high_bound= [5, 5, 5, 5]と思います。 ※動きが不安定な上にまだうまくいっておりません。間違っていたら教えて下さい。原因は高すぎる学習率ではないかと思いますが、まだその調整まで至っていません。

shi.hi

2020/11/23 06:12

5×5のエリアを想定していたので、random.randint(0,4)が正解で high = np.array([4, 4, 4, 4])に修正しました。＠env.pyのstep if action == 0 and self.hunter_Position_X < 5: if action == 1 and self.hunter_Position_X >= 0: if action == 2 and self.hunter_Position_Y < 5: if action == 3 and self.hunter_Position_Y >= 0: に修正いたしました。

退会済みユーザー

2020/11/24 10:59 編集

1．無駄な移動の扱いについて action = 4のくだりはなくていいですね。actionは人間ではなくてプログラムに決めさせるべきだからです。その代わりに、望ましくない行動は減点するようにしこまないといけません。env.pyのcalc_profit()の中で「どれでもない」に該当するelseを加えてそこで減点すればOKです。２．距離が縮まない時の扱い > ※動きが不安定な上にまだうまくいっておりません。これの原因が「距離が縮んでいない」の時の扱いのようでした。弱いペナルティ(-0.5)をするといくらか収束する気配が見えてきました。学習のスケール感と私の理解は以下の感じです。 15000エピソード：無駄な移動を理解し始める 20000エピソード：距離が縮んでいない時を理解し始める 25000エピソード：点数の減りが落ち着き始め、少しずつ-3000くらいから上がり始める 30000エピソード：-1500くらいまで縮まる 50000エピソード： -600くらいまで縮まる

shi.hi

2020/11/24 23:25

試しに10万エピソードまでやってみましたが、報酬が+になっている事を確認しました。可視化してもう少し分かりやすくしようと思います。

退会済みユーザー

2020/11/25 21:47

lr_decay= lambda lr, t: init_lr * (1/ (a* math.exp(b* t))) みたいな感じで減衰させた方が圧倒的に収束が早いです。lrは0.2から少し下がる感じで調整すると良いと思います。a=1,b=0.00001とかでbは要桁数調整です。 epsilonも同様な方法で適当に調整が望ましいです。

退会済みユーザー

2020/11/26 09:23

epsilonはほぼゼロでよさそうです。0.05固定でもいいかもしれません。ちゃんと調整すれば鬼のスコアは1.5万エピソードもあればプラスになります。

shi.hi

2020/11/27 06:58

更新されている事に今気が付きました。 runner.pyの理解がまだ出来ていないのですが、承知しました。

shi.hi

2020/11/27 07:06

ちなみにですが、特に報酬値の変更はせずに「runner.py」のみ変更している認識でよろしいでしょうか。

退会済みユーザー

2020/11/27 10:27

そうなります。学習は遅いのはrunner.pyのepsilonとlearningrateの減衰のためです。既定の条件だとあの問題にはあっていないようですので、その点は修正した方が良いです。報酬値に関しては触る必要はないかと思いますが、「距離が変わっていないよ」の時に弱い罰則があった方が収束は早かったと思います。

shi.hi

2021/01/06 12:54

学習をさせると現状、距離を詰めればいい事は分かっていそうだが捕まえればいい事までは分かっていなさそうに見えます。報酬の設定が不十分なのでしょうか？

退会済みユーザー

2021/01/06 21:46

もし移動できる空間を広くして「距離を詰めればいい事は分かっていそうだが捕まえればいい事までは分かっていなさそう」であれば、捕まえた時の報酬を少し上げるか近づいた時の報酬を少し下げればよさそうな気がします。学習でやっていることは「○○のパラメータの時に△△と動いたら報酬が上がった」を記録しているだけですので、「隣り合っているときに捕まえたら報酬が上がった」を教え込む流れでよいと思います。

shi.hi

2021/01/07 23:40

御回答ありがとうございます。報酬の設定をもう少し変えてみます。

shi.hi

2021/08/17 16:15

エリアを徐々に拡大したり報酬を変化させる等の切り分けを試したのですが、どうも上手く行かない状態です。 1つ原因として予想されるのがエージェントに視覚情報が与えられていない事だと考えられます。要は真っ暗闇の中で強化学習をしてしまっているが原因だと疑っています。現状視覚をエージェントに情報として与えるにはどうプログラミングすればいいのかが分からず、詰まっています。考え方をご教示いただけないでしょうか。

退会済みユーザー

2021/08/18 14:38

以前のコードから変更もあるかと思いますので、今のコードを新しい質問として投稿した方が良いと思います。もっと詳しい方の説明ももらえるかもしれません。Qleaningというよりも、ゲームプログラミングに近いような内容と思いますので、そっち系の詳しい方に見てもらえるような工夫があるとコメントをもらいやすいかもしれません。

行動規範の内容に同意します

あなたの回答

tips

プレビュー

行動規範の内容に同意します

質問の解決につながる回答をしましょう。サンプルコードなど、より具体的な説明があると質問者の理解の助けになります。また、読む側のことを考えた、分かりやすい文章を心がけましょう。

15分調べてもわからないことは
teratailで質問しよう！

ただいまの回答率
85.30%

質問をまとめることで
思考を整理して素早く解決

テンプレート機能で
簡単に質問をまとめる

質問する

関連した質問