前提・実現したいこと
pythonで強化学習の簡単なコードを作っています。
numpyの配列について以下のエラーが出てしまいました。
どこがおかしいか教えていただけるとありがたいです。
発生している問題・エラーメッセージ
Traceback (most recent call last): File "C:\~\rl_study.py", line 185, in <module> reward = sim.run(train=True) File "C:\~\rl_study.py", line 172, in run self.agent.update_model(old_seq, action, reward) File "C:\~\rl_study.py", line 105, in update_model x = Variable(batch[:, 0:1].reshape((self.batch_num, -1)).astype(np.float32)) IndexError: too many indices for array
該当のソースコード
python 3.5.2 ソースコード import chainer from chainer import Variable,optimizers,serializers from chainer import Chain import chainer.functions as F import chainer.links as L import numpy as np np.random.seed(0) # 本問のニューラルネット class Q(Chain): def __init__(self): super(Q, self).__init__( l1=L.Linear(1,16), l2=L.Linear(16,10), ) def __call__(self, x, t): return F.mean_squared_error(self.predict(x,train=True),t) def predict(self,x,train=False): h1 = F.leaky_relu(self.l1(x)) y = F.leaky_relu(self.l2(h1)) return y # DQNアルゴリズムにしたがって動作するエージェント class DQNAgent(): def __init__(self,epsilon = 0.99): self.model = Q() self.optimizer = optimizers.Adam() self.optimizer.setup(self.model) self.epsilon = epsilon self.actions = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0] # 行動の選択肢 self.experienceMemory = [] self.memSize = 20*10 #(20サンプリング×10エピソード) self.experienceMemory_local = [] self.memPos = 0 self.batch_num = 5 self.gamma = 1.0 self.loss = 0 self.reward_award = np.ones(10) * -1 def get_action_value(self, one_step_seq): x = Variable(np.hstack([one_step_seq]).astype(np.float32).reshape((1,-1))) return self.model.predict(x).data[0] def get_greedy_action(self,one_step_seq): x = Variable(np.hstack([one_step_seq]).astype(np.float32).reshape((1,-1))) return self.model.predict(x).data[0] def reduce_epsilon(self): self.epsilon -= 1.0/1000 def get_epsilon(self): return self.epsilon def get_action(self, one_step_seq, train): action = 0 if train == True and np.random.random() < self.epsilon: action = np.random.choice(self.actions) else: action = self.get_greedy_action(one_step_seq) return action def expereince_local(self, one_step_seq, action, reward): self.experienceMemory_local.append(np.hstack([one_step_seq,action,reward])) def experience_global(self, reward): # ベスト10に入る記憶を取り込む if np.min(self.reward_award) < reward: i = np.argmin(self.reward_award) self.reward_award[i] = reward for x in self.experienceMemory_local: self.experience(x) if np.random.random() < 0.01: for x in self.experienceMemory_local: self.experience(x) self.experienceMemory_local = [] def experience(self,x): if len(self.experienceMemory) > self.memSize: self.experienceMemory[int(self.memPos % self.memSize)] = x self.memPos += 1 else: self.experienceMemory.append(x) def update_model(self, one_step_seq, action, reward): """ モデルを更新する """ # 経験メモリにたまってない場合は更新しない if len(self.experienceMemory) < self.batch_num: return # 経験メモリからバッチを作成 memsize = len(self.experienceMemory) batch_index = list(np.random.randint(0,memsize,(self.batch_num))) batch = np.array([self.experienceMemory[i] for i in batch_index]) #x = Variable(batch[:, 0:1].reshape((self.batch_num, -1)).astype(np.float32)) x = Variable(batch[:, 0:1].reshape((self.batch_num, -1)).astype(np.float32)) targets = self.model.predict(x).data.copy() for i in range(self.batch_num): a = batch[i, 1] r = batch[i, 1 + 1] ai = int(10*a-1) targets[i,ai] = (r + self.gamma * np.max(self.get_action_value(a))) t = Variable(np.array(targets).reshape((self.batch_num, -1)).astype(np.float32)) self.model.cleargrads() loss = self.model(x,t) self.loss = loss.data loss.backward() self.optimizer.update() class environment(): ''' 本問の環境。仮に0.5を入力したら最大の報酬を返すものとする ''' def __init__(self): self.reset(0) def reset(self, power): self.power = power def get_reward(self): reward = 0 #desired_power = 0.5 reward = np.min([1 - self.power, self.power]) return reward class simulator: def __init__(self,environment:environment, agent:DQNAgent): self.agent = agent self.env = environment self.num_seq =1 self.reset_seq() self.learning_rate = 1.0 def reset_seq(self): self.seq = np.random.choice(self.agent.actions) def replace_seq(self, power): self.seq[0] = power def run(self,train = True): self.reset_seq() reward = 0 old_seq = self.seq.copy() action = self.agent.get_action(old_seq, train) self.env.reset(old_seq) reward = self.env.get_reward() self.agent.expereince_local(old_seq, action, reward) self.agent.experience_global(reward) if train: self.agent.update_model(old_seq, action, reward) self.agent.reduce_epsilon() return reward if __name__ == '__main__': agent = DQNAgent() env = environment() sim = simulator(env,agent) test_highscore = 0 for i in range(100): reward = sim.run(train=True) if i % 10 == 0: serializers.save_npz('0401model/%03d.model' % i, agent.model) if i % 5 == 0: reward = sim.run(train = False) if test_highscore < reward: print("highcore!") serializers.save_npz('0401model/%03d_hs.model' % i ,agent.model) test_highscore = reward print(i) print(reward) print("epsilon: %2.2e" % agent.get_epsilon()) print("loss:%2.2e" % agent.loss)
試したこと
参考にしたコード(https://github.com/ashitani/DQN_pendulum)
と比較しましたが、どこが問題なのかわかりませんでした。
補足情報(FW/ツールのバージョンなど)
Numpy 1.13.3
回答1件
あなたの回答
tips
プレビュー