Question edit history

Revision 2

Added images and code.

2019/08/17 12:50

Posted

Deleted user
test CHANGED
@@ -215,3 +215,249 @@
   self._memorize(h_s[tau], self.history[tau], reward, next_state,h_p[tau], done, h_i[tau])
 
  ```
+
+ ![image description](8cdc0c9ccdf06fc79cf6c30ca74cc6a7.jpeg)
+
+ ![image description](dcf8a255d415502e6ab4ba6bc0844e31.jpeg)
+
+ ![image description](6cb04c66bde14ec8ecbd8feb1ed7be66.jpeg)
+
+ Adding the code.
+
+ ```python
+ def __init__(self, path, window_size, skip, save=False, saver_path=None, restore=False, sess=None, noise=True, norm=True):
+     self.path = path
+     self.window_size = window_size
+     self._preproc()
+     self.state_size = (None, self.window_size, self.df.shape[-1])
+     self.skip = skip
+     self.reward = reward
+     self.memory = Memory(self.MEMORY_SIZE)
+     self.mem = Memory
+     #
+     self.ent_coef = ent_coef
+     self.target_entropy = target_entropy
+     self.sess = sess
+     self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, noise, norm)
+     self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE, noise, norm)
+     self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE, noise, norm)
+     self.icm = ICM(self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE)
+
+     with tf.variable_scope("loss"):
+         if self.target_entropy == 'auto':
+             self.target_entropy = -np.prod(self.OUTPUT_SIZE).astype(np.float32)
+         self.target_entropy = float(self.target_entropy)
+
+         entropy = tf.reduce_mean(self.actor.entropy)
+
+         if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
+             init_value = 1.0
+             if '_' in self.ent_coef:
+                 init_value = float(self.ent_coef.split('_')[1])
+                 assert init_value > 0., "The initial value of ent_coef must be greater than 0"
+
+             self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32, initializer=np.log(init_value).astype(np.float32))
+             self.ent_coef = tf.exp(self.log_ent_coef)
+         else:
+             self.ent_coef = float(self.ent_coef)
+
+         min_qf = tf.minimum(self.critic.qf1, self.critic.qf2)
+         ent_coef_loss, entropy_optimizer = None, None
+         if not isinstance(self.ent_coef, float):
+             ent_coef_loss = -tf.reduce_mean(
+                 self.log_ent_coef * tf.stop_gradient(self.actor.log_pi + self.target_entropy))
+
+         policy_kl_loss = tf.reduce_mean(self.ent_coef * self.actor.log_pi - self.critic.qf1)
+         policy_loss = policy_kl_loss
+         v_backup = tf.stop_gradient(min_qf - self.ent_coef * self.actor.log_pi)
+         value_loss = 0.5 * tf.reduce_mean((self.critic.value_fn - v_backup) ** 2)
+         value_loss += self.critic.td_loss
+         self.policy_loss = policy_loss
+         self.actor_optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE, name="actor_optimizer").minimize(policy_loss, var_list=get_vars("actor-original"))
+         self.vf_optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE, name="actor_optimizer").minimize(value_loss, var_list=get_vars("critic-original"))
+         self.entropy_optimizer = tf.train.AdamOptimizer(learning_rate=self.LEARNING_RATE, name="entropy_optimizer").minimize(ent_coef_loss, var_list=self.log_ent_coef)
+
+     self.save = save
+     self.saver = tf.train.Saver(tf.global_variables())
+     self.actor_saver = tf.train.Saver(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "actor-original"))
+     self.saver_path = saver_path
+
+     if restore == True:
+         self.saver.restore(self.sess, self.saver_path)
+         self.actor_saver.restore(self.sess, "actor_sac")
+     else:
+         self.sess.run(tf.global_variables_initializer())
+ ```
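
For reference, the `'auto'` branch above appears to follow the usual SAC temperature tuning. Below is a minimal, self-contained sketch of just that piece, assuming TensorFlow 1.x; the placeholder and variable names are illustrative and not taken from the question.

```python
import numpy as np
import tensorflow as tf  # TF 1.x graph-mode API

n_actions = 3  # stands in for self.OUTPUT_SIZE
target_entropy = float(-np.prod(n_actions).astype(np.float32))  # common heuristic: -|A|

log_pi = tf.placeholder(tf.float32, [None])  # log-prob of sampled actions
log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                               initializer=np.log(1.0).astype(np.float32))
ent_coef = tf.exp(log_ent_coef)

# The temperature is pushed up when the policy is less entropic than the
# target (log_pi + target_entropy > 0) and pushed down otherwise.
ent_coef_loss = -tf.reduce_mean(log_ent_coef * tf.stop_gradient(log_pi + target_entropy))
entropy_optimizer = tf.train.AdamOptimizer(3e-4).minimize(ent_coef_loss,
                                                          var_list=[log_ent_coef])
```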
+
+ Preparing the environment:
+
+ ```python
+ def _preproc(self):
+     df = pd.read_csv(self.path)
+     X = df[["Close"]]
+     X = MinMaxScaler().fit_transform(X)
+     # X = np.asanyarray(X)
+
+     gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(X, np.asanyarray(df[["Open"]])[-len(X)::], self.window_size)
+     x = []
+     y = []
+     for i in gen:
+         x.extend(i[0].tolist())
+     x = np.asanyarray(x)
+
+     self.df = x[-self.STEP_SIZE::]
+     self.trend = np.asanyarray(df[["Open"]])[-len(self.df)::]
+ ```
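
The windowing step in `_preproc` can be checked in isolation. A small sketch on made-up data, assuming `tf.keras` is available (TensorFlow 1.13+); the arrays are dummies, not the question's CSV:

```python
import numpy as np
import tensorflow as tf

window_size = 5
X = np.arange(20, dtype=np.float32).reshape(-1, 1)        # stand-in for the scaled Close column
targets = np.arange(20, dtype=np.float32).reshape(-1, 1)  # stand-in for the Open column

gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(X, targets, window_size)

x = []
for i in range(len(gen)):      # each item is a (samples, targets) batch
    batch_x, _ = gen[i]
    x.extend(batch_x.tolist())
x = np.asanyarray(x)

print(x.shape)  # (15, 5, 1): one sliding window of length window_size per sample
```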
+
+ Exploration:
+
+ ```python
+ def _select_action(self, state, next_state=None):
+     prediction, self.init_value = self.sess.run([self.actor.logits, self.actor.last_state],
+                                                 feed_dict={self.actor.X: [state], self.actor.initial_state: self.init_value})
+     self.pred = prediction
+     action = exploration(prediction[0], self.OUTPUT_SIZE, self.EPSILON)
+     if next_state is not None:
+         self.ri = self.sess.run(self.icm.ri,
+                                 feed_dict={self.icm.state: [state], self.icm.next_state: [next_state], self.icm.action: action.reshape((-1, 1))})
+     return action
+ ```
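
`exploration()` itself is not shown in the question, so its exact behaviour is unknown. The sketch below is only one plausible epsilon-greedy reading of the call `exploration(prediction[0], self.OUTPUT_SIZE, self.EPSILON)`, offered as an assumption:

```python
import numpy as np

def exploration(logits, output_size, epsilon):
    """Hypothetical helper: random action with probability epsilon, else greedy."""
    if np.random.rand() < epsilon:
        return np.array([np.random.randint(output_size)])
    return np.array([int(np.argmax(logits))])
```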
+
+ Evaluating the agent:
+
+ ```python
+ def buy(self, spread, pip_cost, sl):
+     position = 3
+     pip = []
+     states = []
+     spread = spread / pip_cost
+     loscut = False
+     self.init_value = np.zeros((1, 512))
+     for t in range(0, len(self.trend), self.skip):
+         state = self.get_state(t)
+         action = self._select_action(state)
+
+         states, pip, position = self.reward(self.trend, t, pip, action, position, states, pip_cost, spread)
+
+         if len(pip) != 0:
+             self.pip = np.asanyarray(pip)
+             total_pip = np.sum(self.pip)
+         # state = next_state
+     return total_pip, self.pip
+ ```
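
`get_state()` is also not included in the question. Since `_preproc` already stores windowed data in `self.df`, one plausible (assumed) implementation is a plain index lookup:

```python
def get_state(self, t):
    # self.df has shape (steps, window_size, features) after _preproc,
    # so the state at step t would just be the t-th window. (Assumption.)
    return self.df[t]
```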

Revision 1

Changed the reward function.

2019/08/17 12:50

Posted

Deleted user
test CHANGED
@@ -16,95 +16,67 @@
 
 
 
- def reward(trend,t,pip,action,position,states,pip_cost,spread):
-     if action == 0:
-         if position == 3:
-             states = [trend[t] + spread]
-             position = 1
-         elif position == 1:
-             sub = 0
-             p = [(trend[t] - s) * pip_cost for s in states]
-             for b in range(0,len(p)):
-                 r = [np.asanyarray([-40.0]), True] if p[b] <= -40 else [p[b], False]
-                 if r[1]:
-                     pip.extend(r[0].tolist())
-                     states.pop(b-sub)
-                     sub += 1
-             states.append(trend[t] + spread)
-             position = 1
-         elif position == 2:
-             p = [(s - trend[t]) * pip_cost for s in states]
-             for b in p:
-                 b = np.asanyarray([-40.0]) if b <= -40 else b
-                 pip.extend(b.tolist())
-             states = [trend[t] + spread]
-             position = 1
-     elif action == 1:
-         if position == 3:
-             states = [trend[t] - spread]
-             position = 2
-         elif position == 2:
-             sub = 0
-             p = [(s - trend[t]) * pip_cost for s in states]
-             for b in range(0,len(p)):
-                 r = [np.asanyarray([-40.0]), True] if p[b] <= -40 else [p[b], False]
-                 if r[1]:
-                     pip.extend(r[0].tolist())
-                     states.pop(b-sub)
-                     sub += 1
-             states.append(trend[t] - spread)
-             position = 2
-         elif position == 1:
-             p = [(trend[t] - s) * pip_cost for s in states]
+ def reward2(double trend, list pip, int action, int position, list states, double pip_cost, double spread):
+     cdef list r = []
+     cdef int sub = 0
+     if position != 3:
+         if action == 0:
+             p = [(trend - s) * pip_cost for s in states]
+         else:
+             p = [(s - trend) * pip_cost for s in states]
+             spread *= -1
+
+         if action == position:
+             sub = 0
+             for b in range(0, len(p)):
+                 r = [-40.0, True] if p[b] <= -40 else [p[b], False]
+                 if r[1]:
+                     pip.append(r[0])
+                     states.pop(b - sub)
+                     sub += 1
+             states.append(trend + spread)
+             position = action
+         else:
 
              for b in p:
 
-                 b = np.asanyarray([-40.0]) if b <= -40 else b
+                 b = -40.0 if b <= -40 else b
-                 pip.extend(b.tolist())
+                 pip.append(b)
-             states = [trend[t] - spread]
+             states = [trend + spread]
-             position = 2
+             position = action
+
+     else:
+         states = [trend + spread] if action == 0 else [trend - spread]
+         position = action
+
 
     return states,pip,position
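
For anyone who wants to poke at the new reward logic without compiling Cython, here is a plain-Python transcription of `reward2` (the `cdef` annotations dropped, behaviour otherwise unchanged) plus a tiny sanity check with made-up prices:

```python
def reward2(trend, pip, action, position, states, pip_cost, spread):
    r = []
    sub = 0
    if position != 3:
        if action == 0:
            # long side: profit when price rises above the stored entries
            p = [(trend - s) * pip_cost for s in states]
        else:
            # short side: profit when price falls; flip the spread sign
            p = [(s - trend) * pip_cost for s in states]
            spread *= -1

        if action == position:
            # same direction: realise only entries that hit the -40 pip cap,
            # then add a new entry at the current price
            sub = 0
            for b in range(0, len(p)):
                r = [-40.0, True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.append(r[0])
                    states.pop(b - sub)
                    sub += 1
            states.append(trend + spread)
            position = action
        else:
            # direction flip: close everything (capped at -40) and re-enter
            for b in p:
                b = -40.0 if b <= -40 else b
                pip.append(b)
            states = [trend + spread]
            position = action
    else:
        # flat: open the first position in the chosen direction
        states = [trend + spread] if action == 0 else [trend - spread]
        position = action

    return states, pip, position

# Open a long position (action 0) while flat (position 3), then keep buying
# after a large adverse move: the losing entry is realised at the -40 pip cap
# and a fresh entry replaces it. Values are illustrative only.
states, pip, position = reward2(1.1000, [], 0, 3, [], 10000, 0.0002)
states, pip, position = reward2(1.0950, pip, 0, position, states, 10000, 0.0002)
print(pip, states, position)  # [-40.0] [1.0952...] 0
```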