Answer edit history

Edit 2: Addendum (test)

**Addendum:**

Please use runner.py from this answer together with envs.py from the other answer.
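
For `gym.make("myenv-v1")` in runner.py below to resolve, the custom environment has to be registered with gym somewhere. If your setup does not already do that (for example in a package `__init__.py`), a minimal sketch, assuming the other answer's file is saved as envs.py next to runner.py, would be:

```Python3
# Hypothetical glue code, not part of either file: register the Game class
# from envs.py under the id that runner.py passes to gym.make().
import gym
from gym.envs.registration import register

register(
    id="myenv-v1",            # the id used by runner.py (COMMON.target_env)
    entry_point="envs:Game",  # module "envs" (envs.py), class "Game"
)

env = gym.make("myenv-v1")    # now resolves to envs.Game
```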

---

**■ Getting it to run**

As I also wrote in the comments, the gym side can only provide the environment, so it seems you have to prepare the Q-learning part yourself.

…

This "number of selectable actions" apparently has to be a discrete value, and normally that would involve a fair amount of tedious work, but Takahiro Kubo's code absorbs it for you. Since the code is easier to handle as a single file, I copy-pasted everything into one.
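
Concretely, what gets absorbed is the discretization: the Q class in runner.py below maps each observation to a single integer state by binning every component with np.digitize and packing the bin indices positionally (the "bin_size numeral system" in its comments). A rough standalone illustration with made-up bins, not part of the answer's files:

```Python3
# Illustration only: discretize a 4-component observation into one integer
# state index, the same way Q.observation_to_state() below does.
import numpy as np

bins = [np.array([1, 2, 3, 4])] * 4   # assumed: 4 components, same bin edges each
unit = 5                              # bins per component

def observation_to_state(observation):
    state = 0
    for d, o in enumerate(np.asarray(observation).flatten()):
        state += np.digitize(o, bins[d]) * unit ** d  # positional packing
    return state

print(observation_to_state((0, 3, 5, 2)))  # a single integer, usable as a Q-table key
```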

…

runner.py

```Python3
# Original code was provided on https://github.com/icoxfog417/techcircle_openai_handson
# that provided as MIT license by Takahiro Kubo.
# This was modified from "handson3.py".

import os
import sys
import math
import argparse
import gym

RECORD_PATH = os.path.join(os.path.dirname(__file__), "./upload")

from collections import defaultdict
import numpy as np


####
class COMMON():
    # target_env = "myenv-v1" # "CartPole-v0"
    # target_env = "CartPole-v0"
    target_env = "myenv-v1"

    if target_env == "CartPole-v0":
        bins_size = [3, 3, 8, 5]    # number of splitted parameters
        low_bound = [None, -0.5, None, -math.radians(50)]   # Limit of minimum value for each parameter
        high_bound = [None, 0.5, None, math.radians(50)]    # Limit of maximum value for each parameter
    else:
        bins_size = [5, 5, 5, 5]
        low_bound = [0, 0, 0, 0]    # Limit of minimum value for each parameter
        high_bound = [4, 4, 4, 4]   # Limit of maximum value for each parameter


####
# Copied from "q.py"
class Q():

    def __init__(self, n_actions, observation_space, bin_size, low_bound=None, high_bound=None, initial_mean=0.0, initial_std=0.0):
        self.n_actions = n_actions
        self._observation_dimension = 1
        for d in observation_space.shape:
            self._observation_dimension *= d

        self._bin_sizes = bin_size if isinstance(bin_size, list) else [bin_size] * self._observation_dimension
        self._dimension_bins = []
        for i, low, high in self._low_high_iter(observation_space, low_bound, high_bound):
            b_size = self._bin_sizes[i]
            bins = self._make_bins(low, high, b_size)
            self._dimension_bins.append(bins)

        # if we encounter the new observation, we initialize action evaluations
        self.table = defaultdict(lambda: initial_std * np.random.randn(self.n_actions) + initial_mean)

    @classmethod
    def _make_bins(cls, low, high, bin_size):
        bins = np.arange(low, high, (float(high) - float(low)) / (bin_size - 2))  # exclude both ends
        if min(bins) < 0 and 0 not in bins:
            bins = np.sort(np.append(bins, [0]))  # 0 centric bins
        return bins

    @classmethod
    def _low_high_iter(cls, observation_space, low_bound, high_bound):
        lows = observation_space.low
        highs = observation_space.high
        for i in range(len(lows)):
            low = lows[i]
            if low_bound is not None:
                _low_bound = low_bound if not isinstance(low_bound, list) else low_bound[i]
                low = low if _low_bound is None else max(low, _low_bound)

            high = highs[i]
            if high_bound is not None:
                _high_bound = high_bound if not isinstance(high_bound, list) else high_bound[i]
                high = high if _high_bound is None else min(high, _high_bound)

            yield i, low, high

    def observation_to_state(self, observation, target_env):
        if target_env == "CartPole-v0":
            state = 0
            # caution: bin_size over 10 will not work accurately
            unit = max(self._bin_sizes)
            for d, o in enumerate(observation.flatten()):
                state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
        else:
            state = 0
            unit = max(self._bin_sizes)
            if observation is None:
                pass
            else:
                for d, o in enumerate(np.asarray(observation).flatten()):
                    state = state + np.digitize(o, self._dimension_bins[d]) * pow(unit, d)  # bin_size numeral system
        return state

    def values(self, observation, target_env):
        state = self.observation_to_state(observation, target_env)
        return self.table[state]


####
# Copied from "agent.py"
import random
import numpy as np

class Agent():

    def __init__(self, q, epsilon=0.05):
        self.q = q
        self.epsilon = epsilon

    def act(self, observation, target_env):
        # your code here
        action = -1
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.q.n_actions)
        else:
            action = np.argmax(self.q.values(observation, target_env))
        return action


####
# Copied from "trainer.py"
from collections import deque


class Trainer():

    def __init__(self, agent, target_env, gamma=0.95, learning_rate=0.1, learning_rate_decay=None, epsilon=0.05, epsilon_decay=None, max_step=-1):
        self.agent = agent
        self.target_env = target_env
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.learning_rate_decay = learning_rate_decay
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.max_step = max_step

    def train(self, env, episode_count, render=False):
        default_epsilon = self.agent.epsilon
        self.agent.epsilon = self.epsilon
        values = []
        steps = deque(maxlen=100)
        lr = self.learning_rate
        for i in range(episode_count):
            obs = env.reset()
            step = 0
            done = False
            while not done:
                if render:
                    if self.target_env == "myenv-v1":
                        print("Not supported yet.")
                    else:
                        env.render()

                action = self.agent.act(obs, self.target_env)
                next_obs, reward, done, _ = env.step(action)

                state = self.agent.q.observation_to_state(obs, self.target_env)
                future = 0 if done else np.max(self.agent.q.values(next_obs, self.target_env))
                value = self.agent.q.table[state][action]
                self.agent.q.table[state][action] += lr * (reward + self.gamma * future - value)

                obs = next_obs
                values.append(value)
                step += 1
                if self.max_step > 0 and step > self.max_step:
                    done = True
            else:
                mean = np.mean(values)
                steps.append(step)
                mean_step = np.mean(steps)
                print("Episode {}: {}steps(avg{}). epsilon={:.3f}, lr={:.3f}, mean q value={:.2f}".format(
                    i, step, mean_step, self.agent.epsilon, lr, mean)
                )

            if self.epsilon_decay is not None:
                self.agent.epsilon = self.epsilon_decay(self.agent.epsilon, i)
            if self.learning_rate_decay is not None:
                lr = self.learning_rate_decay(lr, i)


def main(episodes, render):
    env = gym.make(COMMON.target_env)

    q = Q(
        env.action_space.n,
        env.observation_space,
        bin_size=COMMON.bins_size,
        low_bound=COMMON.low_bound,
        high_bound=COMMON.high_bound
    )
    agent = Agent(q, epsilon=0.05)

    learning_decay = lambda lr, t: max(0.1, min(0.5, 1.0 - math.log10((t + 1) / 25)))
    epsilon_decay = lambda eps, t: max(0.01, min(1.0, 1.0 - math.log10((t + 1) / 25)))
    trainer = Trainer(
        agent,
        target_env=COMMON.target_env,
        gamma=0.99,
        learning_rate=0.5, learning_rate_decay=learning_decay,
        epsilon=1.0, epsilon_decay=epsilon_decay,
        max_step=250)

    trainer.train(env, episode_count=episodes, render=render)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="train & run cartpole ")
    parser.add_argument("--episode", type=int, default=1000, help="episode to train")
    parser.add_argument("--render", action="store_true", help="render the screen")

    args = parser.parse_args()

    main(args.episode, args.render)
```
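
Assuming both files are in place and "myenv-v1" is registered, training would be started with something like `python runner.py --episode 1000` (the argparse defaults above); note that `--render` only prints "Not supported yet." when the target is myenv-v1.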

Edit 1: Attached the actual code (test)

envs.py

```Python3
import gym.spaces
import numpy as np
import pandas
import math
import matplotlib.pyplot as plt
import time
import random


class Game(gym.core.Env):
    # Initial conditions and the various variables are set up here.
    def __init__(self):
        self.hunter_Position_X = random.randint(0, 5)
        self.hunter_Position_Y = random.randint(0, 5)
        # print("鬼の初期位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        # Kept on self so they are accessible everywhere. The hunter's x, y coordinates are placed randomly.
        self.fugitive_Position_X = random.randint(0, 5)
        self.fugitive_Position_Y = random.randint(0, 5)
        # print("逃亡者の初期位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)
        # Kept on self as well. The fugitive's x, y coordinates are placed randomly.
        while self.hunter_Position_X == self.fugitive_Position_X and self.hunter_Position_Y == self.fugitive_Position_Y:
            self.hunter_Position_X = random.randint(0, 5)
            self.hunter_Position_Y = random.randint(0, 5)
            # print(self.hunter_Position_X,self.hunter_Position_Y)
        # If the fugitive and the hunter start on exactly the same square, redraw the hunter's initial position.
        self.game_count = 0
        # Upper limit on the number of moves per game; here it is 10.
        self.initial_distance = int(100 * math.sqrt((self.hunter_Position_X - self.fugitive_Position_X) ** 2 + (self.hunter_Position_Y - self.fugitive_Position_Y) ** 2))
        # print("初期の距離は"+str(self.initial_distance))
        # Distance between hunter and fugitive: just the Pythagorean theorem, multiplied by 100 to handle it as a natural number.
        self.lists = []
        # List that stores the distances.
        self.current_hunter_profit_lists = []
        # The hunter's reward is appended every step.
        self.current_fugitive_profit_lists = []
        # The fugitive's reward is appended every step.

        self.action_space = gym.spaces.Discrete(4)
        low = np.array([0, 0, 0, 0])
        high = np.array([5, 5, 5, 5])
        self.observation_space = gym.spaces.Box(low, high, dtype=np.int64)
        # Defines the escape area.
        self.hunter_reward = 0
        self.fugitive_reward = 0
        # Initializes the hunter's and the fugitive's rewards to 0.
        self.learn_count = 0
        # The number of learning steps is limited to 10000.
        self.lists.append(self.initial_distance)
        # Stores the distance at the start.

    def step(self, action):
        self.game_count += 1
        self.learn_count += 1
        # print("学習回数は",self.learn_count)
        if action == 0 and self.hunter_Position_X < 5:
            self.hunter_Position_X += 1
        if action == 1 and self.hunter_Position_X > 0:
            self.hunter_Position_X -= 1
        if action == 2 and self.hunter_Position_Y < 5:
            self.hunter_Position_Y += 1
        if action == 3 and self.hunter_Position_Y > 0:
            self.hunter_Position_Y -= 1
        # print("鬼の位置は"+str(self.hunter_Position_X),self.hunter_Position_Y)
        # print("逃亡者の位置は"+str(self.fugitive_Position_X),self.fugitive_Position_Y)

        # The hunter has four selectable actions: move up, down, left or right.

        if action == 0 and self.hunter_Position_X == 5:
            pass
        if action == 1 and self.hunter_Position_X == 0:
            pass
        if action == 2 and self.hunter_Position_Y == 5:
            pass
        if action == 3 and self.hunter_Position_Y == 0:
            pass
        # As an exception, trying to step outside the area simply wastes one turn. Writing "and" as "&" behaved unexpectedly, so it is not used.
        # time.sleep(0.01)
        # Wait 0.01 seconds between steps.
        self.d = self.cal_distance(h_X=self.hunter_Position_X, h_Y=self.hunter_Position_Y, f_X=self.fugitive_Position_X, f_Y=self.fugitive_Position_Y)
        self.lists.append(self.d)
        # Store the distance.
        self.observation = (self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y)
        # Store the hunter's and the fugitive's positions every step.

        hunter_reward, fugitive_reward = self.calc_profit()
        # Rewards are computed in calc_profit(); see that method.
        # print("鬼の報酬は"+str(hunter_reward),"逃亡者の報酬は"+str(fugitive_reward))
        print("鬼の総合報酬は", sum(self.current_hunter_profit_lists), "逃亡者の総合報酬は", sum(self.current_fugitive_profit_lists))

        is_end = self.reset()

        # print("return値は",np.array(self.observation),hunter_reward,action)
        return np.array(self.observation), hunter_reward, action, {}
        # Four return values are required. If learning does not go well, the values above need to be changed; include whatever determines the action.

        # if action == 4:
        #     self.fugitive_Position_X += 1
        # if action == 5:
        #     self.fugitive_Position_X -= 1
        # if action == 6:
        #     self.fugitive_Position_Y += 1
        # if action == 7:
        #     self.fugitive_Position_Y -= 1

    def reset_position(self):
        hunter_Position_X = random.randint(0, 5)
        hunter_Position_Y = random.randint(0, 5)
        fugitive_Position_X = random.randint(0, 5)
        fugitive_Position_Y = random.randint(0, 5)
        while hunter_Position_X == fugitive_Position_X and hunter_Position_Y == fugitive_Position_Y:
            hunter_Position_X = random.randint(0, 5)
            hunter_Position_Y = random.randint(0, 5)
        print("リセットされました!!!")
        print()
        return hunter_Position_X, hunter_Position_Y, fugitive_Position_X, fugitive_Position_Y
        # Keep the return values.
        # Describes what to do when the end condition of a game is met.
        # Places the hunter and the fugitive at random.

    def cal_distance(self, h_X, h_Y, f_X, f_Y):
        distance = int(100 * math.sqrt((h_X - f_X) ** 2 + (h_Y - f_Y) ** 2))
        return distance

    def calc_profit(self):
        i = self.game_count
        if i <= 10 and self.lists[i] == 0:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("確保成功!!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward for when the hunter catches the fugitive within 10 moves. Also resets the distance list and the game counter.

        elif i == 10 and (0 not in self.lists):
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("確保失敗!!!")
            self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y = self.reset_position()
            self.game_count = 0
            self.lists = []
            self.lists.append(self.cal_distance(self.hunter_Position_X, self.hunter_Position_Y, self.fugitive_Position_X, self.fugitive_Position_Y))
            # Reward for when the hunter fails to catch the fugitive within 10 moves. Also resets the distance list and the game counter.

        elif i <= 10 and self.lists[i-1] < self.lists[i]:
            self.hunter_reward -= 1
            self.fugitive_reward += 1
            current_hunter_reward = -1
            current_fugitive_reward = 1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("逃げられてるよ!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        elif i <= 10 and self.lists[i-1] > self.lists[i]:
            self.hunter_reward += 1
            self.fugitive_reward -= 1
            current_hunter_reward = 1
            current_fugitive_reward = -1
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("距離を詰めてるね!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        elif i <= 10 and self.lists[i-1] == self.lists[i]:
            self.hunter_reward += 0
            self.fugitive_reward += 0
            current_hunter_reward = 0
            current_fugitive_reward = 0
            self.current_hunter_profit_lists.append(current_hunter_reward)
            self.current_fugitive_profit_lists.append(current_fugitive_reward)
            # print("距離が変わってないよ!!!")
            # Reward defined by comparing the distance at the previous step with the current step.

        else:
            pass

        return current_hunter_reward, current_fugitive_reward

        # def Linear_function:
        #     Y_intercept_1 = self.hunter_Position_Y - math.sqrt(3)*self.hunter_Position_X
        #     Y_intercept_2 = self.hunter_Position_Y + math.sqrt(3)*self.hunter_Position_X
        #     Y_intercept_3 = self.hunter_Position_Y - (1/math.sqrt(3))*self.hunter_Position_X
        #     Y_intercept_4 = self.hunter_Position_Y + (1/math.sqrt(3))*self.hunter_Position_X
        #     Y = math.sqrt(3)X + b

        # A program only does exactly what you write.

    def reset(self):
        if self.learn_count == 0:
            is_end = True
        else:
            is_end = False
        # The only reset condition is that the learning count has been used up; the rewards are reset at that point.
```
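
As a quick sanity check, not part of the original answer, the environment can also be exercised on its own (this assumes the file above is saved as envs.py and its imports such as pandas and matplotlib are available):

```Python3
# Hypothetical smoke test for envs.py: instantiate Game directly and step it
# with random actions. Note that step() returns the chosen action in the
# position where gym conventionally returns "done".
from envs import Game

env = Game()
for _ in range(3):
    obs, reward, done, info = env.step(env.action_space.sample())
    print(obs, reward, done, info)
```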

The sample training output below was part of the previous revision and was removed by this edit:

```
鬼の総合報酬は -5 逃亡者の総合報酬は 5
Episode 295: 2steps(avg1.3). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 440
鬼の総合報酬は -4 逃亡者の総合報酬は 4
学習回数は 441
鬼の総合報酬は -3 逃亡者の総合報酬は 3
学習回数は 442
リセットされました!!!

鬼の総合報酬は -2 逃亡者の総合報酬は 2
学習回数は 443
鬼の総合報酬は -1 逃亡者の総合報酬は 1
学習回数は 444
鬼の総合報酬は 0 逃亡者の総合報酬は 0
Episode 296: 5steps(avg1.34). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 445
鬼の総合報酬は 1 逃亡者の総合報酬は -1
学習回数は 446
鬼の総合報酬は 2 逃亡者の総合報酬は -2
Episode 297: 2steps(avg1.35). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 447
鬼の総合報酬は 1 逃亡者の総合報酬は -1
学習回数は 448
鬼の総合報酬は 2 逃亡者の総合報酬は -2
Episode 298: 2steps(avg1.36). epsilon=0.010, lr=0.100, mean q value=0.04
学習回数は 449
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 450
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 451
鬼の総合報酬は 2 逃亡者の総合報酬は -2
学習回数は 452
リセットされました!!!
```