Question edit history

Revision 2: Added images and code.

@@ -215,3 +215,249 @@
Unchanged context (tail of the existing code block):

```python
self._memorize(h_s[tau], self.history[tau], reward, next_state, h_p[tau], done, h_i[tau])
```
Added:

![Image description](8cdc0c9ccdf06fc79cf6c30ca74cc6a7.jpeg)

![Image description](dcf8a255d415502e6ab4ba6bc0844e31.jpeg)

![Image description](6cb04c66bde14ec8ecbd8feb1ed7be66.jpeg)

Adding the code below.
```python
# Assumes module-level imports (numpy as np, tensorflow as tf, etc.) and the
# Actor / Critic / ICM / Memory classes defined elsewhere in the question.
def __init__(self, path, window_size, skip, save=False, saver_path=None,
             restore=False, sess=None, noise=True, norm=True):
    self.path = path
    self.window_size = window_size
    self._preproc()
    self.state_size = (None, self.window_size, self.df.shape[-1])
    self.skip = skip
    # NOTE: reward, ent_coef and target_entropy are not parameters of this
    # method; they must come from the enclosing module or class.
    self.reward = reward
    self.memory = Memory(self.MEMORY_SIZE)
    self.mem = Memory

    self.ent_coef = ent_coef
    self.target_entropy = target_entropy
    self.sess = sess
    self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, noise, norm)
    self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE,
                         self.LEARNING_RATE, noise, norm)
    self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE,
                                self.LEARNING_RATE, noise, norm)
    self.icm = ICM(self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE)

    with tf.variable_scope("loss"):
        if self.target_entropy == 'auto':
            self.target_entropy = -np.prod(self.OUTPUT_SIZE).astype(np.float32)
        self.target_entropy = float(self.target_entropy)

        entropy = tf.reduce_mean(self.actor.entropy)

        if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
            init_value = 1.0
            if '_' in self.ent_coef:
                init_value = float(self.ent_coef.split('_')[1])
                assert init_value > 0., "The initial value of ent_coef must be greater than 0"

            self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                initializer=np.log(init_value).astype(np.float32))
            self.ent_coef = tf.exp(self.log_ent_coef)
        else:
            self.ent_coef = float(self.ent_coef)

        min_qf = tf.minimum(self.critic.qf1, self.critic.qf2)
        ent_coef_loss, entropy_optimizer = None, None
        if not isinstance(self.ent_coef, float):
            ent_coef_loss = -tf.reduce_mean(
                self.log_ent_coef * tf.stop_gradient(self.actor.log_pi + self.target_entropy))

        policy_kl_loss = tf.reduce_mean(self.ent_coef * self.actor.log_pi - self.critic.qf1)
        policy_loss = policy_kl_loss
        v_backup = tf.stop_gradient(min_qf - self.ent_coef * self.actor.log_pi)
        value_loss = 0.5 * tf.reduce_mean((self.critic.value_fn - v_backup) ** 2)
        value_loss += self.critic.td_loss
        self.policy_loss = policy_loss
        self.actor_optimizer = tf.train.AdamOptimizer(
            self.LEARNING_RATE, name="actor_optimizer").minimize(
                policy_loss, var_list=get_vars("actor-original"))
        # NOTE: this optimizer reuses the name "actor_optimizer";
        # "vf_optimizer" was probably intended.
        self.vf_optimizer = tf.train.AdamOptimizer(
            self.LEARNING_RATE, name="actor_optimizer").minimize(
                value_loss, var_list=get_vars("critic-original"))
        self.entropy_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE, name="entropy_optimizer").minimize(
                ent_coef_loss, var_list=self.log_ent_coef)

    self.save = save
    self.saver = tf.train.Saver(tf.global_variables())
    self.actor_saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "actor-original"))
    self.saver_path = saver_path

    if restore:
        self.saver.restore(self.sess, self.saver_path)
        self.actor_saver.restore(self.sess, "actor_sac")
    else:
        self.sess.run(tf.global_variables_initializer())
```
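A note on the undefined helper: `get_vars(...)` is called above but not included in the question. In TF1-style code it is usually just a scope-filtered variable lookup; a minimal sketch under that assumption:

```python
import tensorflow as tf

def get_vars(scope):
    # Assumed helper (not shown in the question): collect the trainable
    # variables created under `scope` for use as an optimizer var_list.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
```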
Preparing the environment:

```python
def _preproc(self):
    df = pd.read_csv(self.path)
    X = df[["Close"]]
    X = MinMaxScaler().fit_transform(X)
    # X = np.asanyarray(X)

    gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
        X, np.asanyarray(df[["Open"]])[-len(X):], self.window_size)
    x = []
    y = []
    for i in gen:
        x.extend(i[0].tolist())
    x = np.asanyarray(x)

    self.df = x[-self.STEP_SIZE:]
    self.trend = np.asanyarray(df[["Open"]])[-len(self.df):]
```
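As a quick illustration of the windowing step (synthetic data, not the question's CSV), `TimeseriesGenerator` yields `(windows, targets)` batches whose first axis `_preproc` then flattens into `x`:

```python
import numpy as np
import tensorflow as tf

# Synthetic stand-ins for the scaled "Close" values and the "Open" targets.
X = np.arange(20, dtype=np.float32).reshape(-1, 1)
targets = np.arange(20, dtype=np.float32).reshape(-1, 1)

gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(X, targets, length=5)
batch_x, batch_y = gen[0]
print(batch_x.shape)  # (15, 5, 1): one window of length 5 per remaining row
print(batch_y.shape)  # (15, 1): the target row just after each window
```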
Exploration:

```python
def _select_action(self, state, next_state=None):
    prediction, self.init_value = self.sess.run(
        [self.actor.logits, self.actor.last_state],
        feed_dict={self.actor.X: [state], self.actor.initial_state: self.init_value})
    self.pred = prediction
    action = exploration(prediction[0], self.OUTPUT_SIZE, self.EPSILON)
    if next_state is not None:
        self.ri = self.sess.run(
            self.icm.ri,
            feed_dict={self.icm.state: [state], self.icm.next_state: [next_state],
                       self.icm.action: action.reshape((-1, 1))})
    return action
```
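`exploration` is another helper the question does not show. Assuming it implements plain epsilon-greedy over the actor's output (only a guess at its contract), it might look like:

```python
import numpy as np

def exploration(prediction, output_size, epsilon):
    # Hypothetical helper: random action with probability epsilon,
    # otherwise the greedy argmax of the predicted logits.
    if np.random.rand() < epsilon:
        return np.asarray([np.random.randint(output_size)])
    return np.asarray([np.argmax(prediction)])
```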
Evaluating the agent:

```python
def buy(self, spread, pip_cost, sl):
    position = 3
    pip = []
    states = []
    spread = spread / pip_cost
    loscut = False
    self.init_value = np.zeros((1, 512))
    for t in range(0, len(self.trend), self.skip):
        state = self.get_state(t)
        action = self._select_action(state)

        states, pip, position = self.reward(self.trend, t, pip, action, position,
                                            states, pip_cost, spread)

    # NOTE: self.pip is only assigned when at least one trade closed,
    # so the np.sum below fails if pip is still empty here.
    if len(pip) != 0:
        self.pip = np.asanyarray(pip)
    total_pip = np.sum(self.pip)
    # state = next_state
    return total_pip, self.pip
```
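For orientation, a sketch of how these methods might be driven end to end; the enclosing class name `Agent` and every argument value below are assumptions for illustration, not taken from the question:

```python
import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # Hypothetical construction: CSV path, window size and skip are made up.
    agent = Agent(path="prices.csv", window_size=32, skip=1, sess=sess)
    total_pip, per_trade = agent.buy(spread=3, pip_cost=100, sl=40)
    print(total_pip, len(per_trade))
```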
Revision 1: Changed the reward function.

@@ -16,95 +16,67 @@
Removed (the old reward function; several lines are cut off in the diff view):

```python
def reward(trend, t
        ...
            states = [trend[t] + spread]
            position = 1
    elif action == 1:
        if position == 3:
            states = [trend[t] - spread]
            position = 2
        elif position == 2:
            sub = 0
            p = [(s - trend[t]) * pip_cost for s in states]
            for b in range(0, len(p)):
                r = [np.asanyarray([-40.0]), True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.extend(r[0].tolist())
                    states.pop(b - sub)
                    sub += 1
            states.append(trend[t] - spread)
            position = 2
        elif position == 1:
            p = [(trend[t] - s) * pip_cost for s in states]
            ...
```
Added:

```python
def reward2(double trend, list pip, int action, int position, list states,
            double pip_cost, double spread):
    cdef list r = []
    cdef int sub = 0
    if position != 3:
        if action == 0:
            p = [(trend - s) * pip_cost for s in states]
        else:
            p = [(s - trend) * pip_cost for s in states]
            spread *= -1

        if action == position:
            sub = 0
            for b in range(0, len(p)):
                r = [-40.0, True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.append(r[0])
                    states.pop(b - sub)
                    sub += 1
            states.append(trend + spread)
            position = action
        else:
            for b in p:
                b = -40.0 if b <= -40 else b
                pip.append(b)
            states = [trend + spread]
            position = action
    else:
        states = [trend + spread] if action == 0 else [trend - spread]
        position = action

    return states, pip, position
```
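Since `reward2` carries Cython `cdef` declarations, it only runs after compilation. For quick sanity checks, an untyped plain-Python twin of the same logic (my copy, not the author's file) can be exercised directly:

```python
def reward2_py(trend, pip, action, position, states, pip_cost, spread):
    # Same control flow as reward2 above, with the cdef typing dropped.
    sub = 0
    if position != 3:
        if action == 0:
            p = [(trend - s) * pip_cost for s in states]
        else:
            p = [(s - trend) * pip_cost for s in states]
            spread *= -1
        if action == position:
            for b in range(len(p)):
                r = [-40.0, True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.append(r[0])
                    states.pop(b - sub)
                    sub += 1
            states.append(trend + spread)
            position = action
        else:
            for b in p:
                pip.append(-40.0 if b <= -40 else b)
            states = [trend + spread]
            position = action
    else:
        states = [trend + spread] if action == 0 else [trend - spread]
        position = action
    return states, pip, position

# Hypothetical values: starting flat (position 3), a buy action opens a long
# at the current price plus spread.
states, pip, position = reward2_py(1.1000, [], 0, 3, [], 10000.0, 0.0001)
print(states, position)  # -> [1.1001] 0
```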