Premise / What I want to achieve
In TensorFlow, I want to resume training, or run evaluation, from a saved checkpoint.
The problem
No matter how many times I run it, the reward on the first run comes out to a similar value.
Relevant source code
```python
if restore == False:
    # Build a fresh graph: actor/critic networks plus their targets,
    # and the placeholders used during training.
    self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, noise, norm)
    self.actor_target = Actor('actor-target', self.state_size, self.OUTPUT_SIZE, noise, norm)
    self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE, noise, norm)
    self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LEARNING_RATE, noise, norm)
    self.actions = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE), name="actions")
    self.log_pi = tf.placeholder(tf.float32, (None,), name="log_pi")
    self.entropy = tf.placeholder(tf.float32, (None,), name="entropy")
else:
    # Rebuild the graph from the latest checkpoint's .meta file,
    # restore the weights, and look the placeholders up by name.
    saver = tf.train.import_meta_graph(tf.train.latest_checkpoint('./') + ".meta")
    saver.restore(self.sess, tf.train.latest_checkpoint('./'))
    self.actor = Actor_Restore("actor-original")
    self.actor_target = Actor_Restore("actor-target")
    self.critic = Critic_Restore("critic-original", self.LEARNING_RATE)
    self.critic_target = Critic_Restore("critic-target", self.LEARNING_RATE)
    self.actions = tf.get_default_graph().get_tensor_by_name("actions:0")
    self.log_pi = tf.get_default_graph().get_tensor_by_name("log_pi:0")
    self.entropy = tf.get_default_graph().get_tensor_by_name("entropy:0")
```
Variable initialization is run only for variables that have not yet been initialized.
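For reference, a minimal sketch of one common way to do this in TF1.x; the helper name `initialize_uninitialized` is hypothetical, not from the code above:

```python
import tensorflow as tf

def initialize_uninitialized(sess):
    # Ask the session which global variables are still uninitialized
    # and run an initializer only for those, so weights restored from
    # the checkpoint are left untouched.
    global_vars = tf.global_variables()
    init_flags = sess.run([tf.is_variable_initialized(v) for v in global_vars])
    uninitialized = [v for v, ok in zip(global_vars, init_flags) if not ok]
    if uninitialized:
        sess.run(tf.variables_initializer(uninitialized))
```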
Action-selection code
```python
import numpy as np

def exploration(prediction, output_size):
    prediction = prediction.astype("float64")

    if np.random.rand() < 0.3:
        # Sample from a temperature-softened softmax over the scores.
        tau = 2.0
        clip = (-250.0, 250.0)
        exp_values = np.exp(np.clip(prediction / tau, clip[0], clip[1]))
        probs = exp_values / np.sum(exp_values)
        action = np.random.choice(range(output_size), p=probs)
    else:
        # Otherwise act greedily.
        action = np.argmax(prediction)

    return action
```
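A hypothetical call, just to show the shapes involved; the score vector here is made up:

```python
import numpy as np

prediction = np.array([0.9, 0.1, -0.4], dtype=np.float32)  # one state's action scores
action = exploration(prediction, output_size=3)
# ~70% of the time this is np.argmax -> 0; otherwise an action
# sampled from the temperature-2 softmax over the three scores.
```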
Training code
```python
def train(self, iterations, checkpoint, spread, pip_cost, n=4):
    copy = 0
    for i in range(iterations):
        # Reset per-episode bookkeeping.
        position = 3
        total_pip = 0.00
        max_pip = 0
        pip = []
        spread = spread / pip_cost  # note: re-divided on every outer iteration
        done = False
        states = []
        p = []
        h_s = []
        h_r = []
        h_i = []
        self.init_value = np.zeros((1, 512))
        tau = 0
        old_reword = 0.0
        self.history = []
        self.MEMORIES = deque()
        self.pip = []
        # Periodically copy the online networks into the targets.
        if (copy + 1) % 4 == 0:
            self._assign('actor-original', 'actor-target')
            self._assign('critic-original', 'critic-target')

        for t in range(0, len(self.trend) - 1, self.skip):
            state = self.get_state(t)
            h_s.append(state)
            action = self._select_action(state)
            self.history.append(action)
            h_i.append(self.init_value[0])
            next_state = self.get_state(t + 1)
            # print(t)
            states, pip, position = self.reward(self.trend, t, pip, action,
                                                position, states, pip_cost, spread)

            if len(pip) != 0:
                self.pip = np.asanyarray(pip)
                total_pip = np.sum(self.pip)
                mean_pip = total_pip / (t + 1)
                # Reward is the change in mean pips since the last step.
                reward = mean_pip - old_reword
                old_reword = mean_pip
                h_r.append(reward)
```
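The restore branch above depends on `tf.train.latest_checkpoint('./')`, so the training loop has to write checkpoints into that same directory. A minimal sketch of the saving side, assuming TF1.x; the `CheckpointWriter` name and the `./model.ckpt` path are assumptions, with saving presumably keyed to the `checkpoint` argument of `train()`:

```python
import tensorflow as tf

class CheckpointWriter:
    """Hypothetical helper; build once, after the graph is constructed."""
    def __init__(self, sess, path='./model.ckpt'):
        self.sess = sess
        self.path = path
        self.saver = tf.train.Saver(max_to_keep=5)

    def save(self, step):
        # Writes model.ckpt-<step>.{meta,index,data} and updates the
        # 'checkpoint' file that tf.train.latest_checkpoint('./') reads.
        self.saver.save(self.sess, self.path, global_step=step)
```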