Question edit history

Revision 2: Added images and code.

@@ -215,3 +215,249 @@
Unchanged context (tail of the existing code block):

```python
self._memorize(h_s[tau], self.history[tau], reward, next_state, h_p[tau], done, h_i[tau])
```
Added:

![Image description](8cdc0c9ccdf06fc79cf6c30ca74cc6a7.jpeg)

![Image description](dcf8a255d415502e6ab4ba6bc0844e31.jpeg)

![Image description](6cb04c66bde14ec8ecbd8feb1ed7be66.jpeg)

Adding the code below.
```python
# Assumes module-level imports (numpy as np, tensorflow as tf, etc.) and the
# Actor / Critic / ICM / Memory classes defined elsewhere in the question.
def __init__(self, path, window_size, skip, save=False, saver_path=None,
             restore=False, sess=None, noise=True, norm=True):
    self.path = path
    self.window_size = window_size
    self._preproc()
    self.state_size = (None, self.window_size, self.df.shape[-1])
    self.skip = skip
    # NOTE: reward, ent_coef and target_entropy are not parameters of this
    # method; they must come from the enclosing module or class.
    self.reward = reward
    self.memory = Memory(self.MEMORY_SIZE)
    self.mem = Memory

    self.ent_coef = ent_coef
    self.target_entropy = target_entropy
    self.sess = sess
    self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, noise, norm)
    self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE,
                         self.LEARNING_RATE, noise, norm)
    self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE,
                                self.LEARNING_RATE, noise, norm)
    self.icm = ICM(self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE)

    with tf.variable_scope("loss"):
        if self.target_entropy == 'auto':
            self.target_entropy = -np.prod(self.OUTPUT_SIZE).astype(np.float32)
        self.target_entropy = float(self.target_entropy)

        entropy = tf.reduce_mean(self.actor.entropy)

        if isinstance(self.ent_coef, str) and self.ent_coef.startswith('auto'):
            init_value = 1.0
            if '_' in self.ent_coef:
                init_value = float(self.ent_coef.split('_')[1])
                assert init_value > 0., "The initial value of ent_coef must be greater than 0"

            self.log_ent_coef = tf.get_variable('log_ent_coef', dtype=tf.float32,
                                                initializer=np.log(init_value).astype(np.float32))
            self.ent_coef = tf.exp(self.log_ent_coef)
        else:
            self.ent_coef = float(self.ent_coef)

        min_qf = tf.minimum(self.critic.qf1, self.critic.qf2)
        ent_coef_loss, entropy_optimizer = None, None
        if not isinstance(self.ent_coef, float):
            ent_coef_loss = -tf.reduce_mean(
                self.log_ent_coef * tf.stop_gradient(self.actor.log_pi + self.target_entropy))

        policy_kl_loss = tf.reduce_mean(self.ent_coef * self.actor.log_pi - self.critic.qf1)
        policy_loss = policy_kl_loss
        v_backup = tf.stop_gradient(min_qf - self.ent_coef * self.actor.log_pi)
        value_loss = 0.5 * tf.reduce_mean((self.critic.value_fn - v_backup) ** 2)
        value_loss += self.critic.td_loss
        self.policy_loss = policy_loss
        self.actor_optimizer = tf.train.AdamOptimizer(
            self.LEARNING_RATE, name="actor_optimizer").minimize(
                policy_loss, var_list=get_vars("actor-original"))
        # NOTE: this optimizer reuses the name "actor_optimizer";
        # "vf_optimizer" was probably intended.
        self.vf_optimizer = tf.train.AdamOptimizer(
            self.LEARNING_RATE, name="actor_optimizer").minimize(
                value_loss, var_list=get_vars("critic-original"))
        self.entropy_optimizer = tf.train.AdamOptimizer(
            learning_rate=self.LEARNING_RATE, name="entropy_optimizer").minimize(
                ent_coef_loss, var_list=self.log_ent_coef)

    self.save = save
    self.saver = tf.train.Saver(tf.global_variables())
    self.actor_saver = tf.train.Saver(
        tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, "actor-original"))
    self.saver_path = saver_path

    if restore:
        self.saver.restore(self.sess, self.saver_path)
        self.actor_saver.restore(self.sess, "actor_sac")
    else:
        self.sess.run(tf.global_variables_initializer())
```
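A note on the undefined helper: `get_vars(...)` is called above but not included in the question. In TF1-style code it is usually just a scope-filtered variable lookup; a minimal sketch under that assumption:

```python
import tensorflow as tf

def get_vars(scope):
    # Assumed helper (not shown in the question): collect the trainable
    # variables created under `scope` for use as an optimizer var_list.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
```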
Preparing the environment:

```python
def _preproc(self):
    df = pd.read_csv(self.path)
    X = df[["Close"]]
    X = MinMaxScaler().fit_transform(X)
    # X = np.asanyarray(X)

    gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(
        X, np.asanyarray(df[["Open"]])[-len(X):], self.window_size)
    x = []
    y = []
    for i in gen:
        x.extend(i[0].tolist())
    x = np.asanyarray(x)

    self.df = x[-self.STEP_SIZE:]
    self.trend = np.asanyarray(df[["Open"]])[-len(self.df):]
```
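As a quick illustration of the windowing step (synthetic data, not the question's CSV), `TimeseriesGenerator` yields `(windows, targets)` batches whose first axis `_preproc` then flattens into `x`:

```python
import numpy as np
import tensorflow as tf

# Synthetic stand-ins for the scaled "Close" values and the "Open" targets.
X = np.arange(20, dtype=np.float32).reshape(-1, 1)
targets = np.arange(20, dtype=np.float32).reshape(-1, 1)

gen = tf.keras.preprocessing.sequence.TimeseriesGenerator(X, targets, length=5)
batch_x, batch_y = gen[0]
print(batch_x.shape)  # (15, 5, 1): one window of length 5 per remaining row
print(batch_y.shape)  # (15, 1): the target row just after each window
```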
Exploration:

```python
def _select_action(self, state, next_state=None):
    prediction, self.init_value = self.sess.run(
        [self.actor.logits, self.actor.last_state],
        feed_dict={self.actor.X: [state], self.actor.initial_state: self.init_value})
    self.pred = prediction
    action = exploration(prediction[0], self.OUTPUT_SIZE, self.EPSILON)
    if next_state is not None:
        self.ri = self.sess.run(
            self.icm.ri,
            feed_dict={self.icm.state: [state], self.icm.next_state: [next_state],
                       self.icm.action: action.reshape((-1, 1))})
    return action
```
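`exploration` is another helper the question does not show. Assuming it implements plain epsilon-greedy over the actor's output (only a guess at its contract), it might look like:

```python
import numpy as np

def exploration(prediction, output_size, epsilon):
    # Hypothetical helper: random action with probability epsilon,
    # otherwise the greedy argmax of the predicted logits.
    if np.random.rand() < epsilon:
        return np.asarray([np.random.randint(output_size)])
    return np.asarray([np.argmax(prediction)])
```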
Evaluating the agent:

```python
def buy(self, spread, pip_cost, sl):
    position = 3
    pip = []
    states = []
    spread = spread / pip_cost
    loscut = False
    self.init_value = np.zeros((1, 512))
    for t in range(0, len(self.trend), self.skip):
        state = self.get_state(t)
        action = self._select_action(state)

        states, pip, position = self.reward(self.trend, t, pip, action, position,
                                            states, pip_cost, spread)

    # NOTE: self.pip is only assigned when at least one trade closed,
    # so the np.sum below fails if pip is still empty here.
    if len(pip) != 0:
        self.pip = np.asanyarray(pip)
    total_pip = np.sum(self.pip)
    # state = next_state
    return total_pip, self.pip
```
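For orientation, a sketch of how these methods might be driven end to end; the enclosing class name `Agent` and every argument value below are assumptions for illustration, not taken from the question:

```python
import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # Hypothetical construction: CSV path, window size and skip are made up.
    agent = Agent(path="prices.csv", window_size=32, skip=1, sess=sess)
    total_pip, per_trade = agent.buy(spread=3, pip_cost=100, sl=40)
    print(total_pip, len(per_trade))
```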
Revision 1: Changed the reward function.

@@ -16,95 +16,67 @@
Removed (the old reward function; several lines are cut off in the diff view):

```python
def reward(trend, t
        ...
            states = [trend[t] + spread]
            position = 1
    elif action == 1:
        if position == 3:
            states = [trend[t] - spread]
            position = 2
        elif position == 2:
            sub = 0
            p = [(s - trend[t]) * pip_cost for s in states]
            for b in range(0, len(p)):
                r = [np.asanyarray([-40.0]), True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.extend(r[0].tolist())
                    states.pop(b - sub)
                    sub += 1
            states.append(trend[t] - spread)
            position = 2
        elif position == 1:
            p = [(trend[t] - s) * pip_cost for s in states]
            ...
```
Added:

```python
def reward2(double trend, list pip, int action, int position, list states,
            double pip_cost, double spread):
    cdef list r = []
    cdef int sub = 0
    if position != 3:
        if action == 0:
            p = [(trend - s) * pip_cost for s in states]
        else:
            p = [(s - trend) * pip_cost for s in states]
            spread *= -1

        if action == position:
            sub = 0
            for b in range(0, len(p)):
                r = [-40.0, True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.append(r[0])
                    states.pop(b - sub)
                    sub += 1
            states.append(trend + spread)
            position = action
        else:
            for b in p:
                b = -40.0 if b <= -40 else b
                pip.append(b)
            states = [trend + spread]
            position = action
    else:
        states = [trend + spread] if action == 0 else [trend - spread]
        position = action

    return states, pip, position
```
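Since `reward2` carries Cython `cdef` declarations, it only runs after compilation. For quick sanity checks, an untyped plain-Python twin of the same logic (my copy, not the author's file) can be exercised directly:

```python
def reward2_py(trend, pip, action, position, states, pip_cost, spread):
    # Same control flow as reward2 above, with the cdef typing dropped.
    sub = 0
    if position != 3:
        if action == 0:
            p = [(trend - s) * pip_cost for s in states]
        else:
            p = [(s - trend) * pip_cost for s in states]
            spread *= -1
        if action == position:
            for b in range(len(p)):
                r = [-40.0, True] if p[b] <= -40 else [p[b], False]
                if r[1]:
                    pip.append(r[0])
                    states.pop(b - sub)
                    sub += 1
            states.append(trend + spread)
            position = action
        else:
            for b in p:
                pip.append(-40.0 if b <= -40 else b)
            states = [trend + spread]
            position = action
    else:
        states = [trend + spread] if action == 0 else [trend - spread]
        position = action
    return states, pip, position

# Hypothetical values: starting flat (position 3), a buy action opens a long
# at the current price plus spread.
states, pip, position = reward2_py(1.1000, [], 0, 3, [], 10000.0, 0.0001)
print(states, position)  # -> [1.1001] 0
```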