Background
I am implementing actor-critic in PyTorch.
The code is based on https://github.com/oreilly-japan/deep-learning-from-scratch-4/blob/master/pytorch/actor_critic.py, with only the environment part changed.
However, loss_pi.backward() raises an in-place error and the script does not run.

What I want to achieve
Run actor-critic without triggering the in-place error.

Problem / error message
```
File "ac.py", line 192, in <module>
simulation(device)
File "ac.py", line 173, in simulation
action, action_prob = agent.get_action(state, episode)
File "ac.py", line 65, in get_action
probs = self.pi(state)
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "ac.py", line 29, in forward
x = F.relu(self.l1(x))
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/nn/modules/linear.py", line 103, in forward
return F.linear(input, self.weight, self.bias)
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/nn/functional.py", line 1848, in linear
return torch._C._nn.linear(input, weight, bias)
(function _print_stack)
Traceback (most recent call last):
File "ac.py", line 192, in <module>
simulation(device)
File "ac.py", line 177, in simulation
agent.update(state, action_prob, reward, next_state, done)
File "ac.py", line 85, in update
loss_pi.backward()
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/○○/.pyenv/versions/miniconda3-latest/envs/○○/lib/python3.7/site-packages/torch/autograd/init.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1, 68340]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
```
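The stack frames printed above the Traceback (ending in `(function _print_stack)`) look like output from autograd's anomaly detection, which points at `self.pi(state)` / `F.linear` inside `get_action` as the operation whose saved input was later modified. For reference, a minimal sketch of how I understand that extra backtrace gets produced (assuming something equivalent is enabled near the top of ac.py):

```python
# Sketch: with anomaly detection enabled, backward() also prints the
# forward-pass stack of the operation whose saved tensor was modified
# in place (the "(function _print_stack)" block above).
import torch
torch.autograd.set_detect_anomaly(True)
```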
Relevant source code
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.distributions import Categorical


class PolicyNet(torch.nn.Module):
    def __init__(self, data, hidden_dim=128):
        super().__init__()
        self.action_size = data['word'].x.size()[0]
        self.emb_dim = data['word'].x.size()[1] + data['spot'].x.size()[1]
        self.hidden_dim = hidden_dim
        self.l1 = nn.Linear(self.action_size, self.hidden_dim)
        self.l2 = nn.Linear(self.hidden_dim, self.action_size)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        x = F.softmax(x, dim=1)
        return x


class ValueNet(torch.nn.Module):
    def __init__(self, data, hidden_dim=128):
        super().__init__()
        self.action_size = data['word'].x.size()[0]
        self.hidden_dim = hidden_dim
        self.l1 = nn.Linear(self.action_size, self.hidden_dim)
        self.l2 = nn.Linear(self.hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.l1(x))
        x = self.l2(x)
        return x


class Agent:
    def __init__(self, data, device):
        self.gamma = 0.98
        self.lr_pi = 2e-4
        self.lr_v = 5e-4
        self.action_size = data['word'].x.size()[1]

        self.pi = PolicyNet(data).to(device)
        self.v = ValueNet(data).to(device)
        self.optimizer_pi = Adam(self.pi.parameters(), self.lr_pi)
        self.optimizer_v = Adam(self.v.parameters(), self.lr_v)
        self.data = data
        self.device = device

    def get_action(self, state):
        state = state.unsqueeze(0)
        probs = self.pi(state)
        probs = probs[0]
        m = Categorical(probs)
        action = m.sample().item()
        return action, probs[action]

    def update(self, state, action_prob, reward, next_state, done):
        state = state.unsqueeze(0)
        next_state = next_state.unsqueeze(0)
        target = reward + self.gamma * self.v(next_state)  # TD target
        target.detach()
        v = self.v(state)
        loss_fn = nn.MSELoss()
        loss_v = loss_fn(v, target)

        delta = target - v
        loss_pi = -torch.log(action_prob) * delta.item()
        self.optimizer_v.zero_grad()
        self.optimizer_pi.zero_grad()
        loss_v.backward()
        loss_pi.backward()
        self.optimizer_v.step()
        self.optimizer_pi.step()


class MyEnv:
    def __init__(self, data, device):
        # omitted
        self.device = device
        self.original_data = data
        self.data = data

    def reset(self):
        # omitted: returns a fresh state
        return self.state

    @torch.no_grad()
    def step(self, action):
        # omitted: uses a separate model to compute the next state and
        # reward for the given action
        return next_state, reward, done, info


def simulation(device):
    episodes = 1000000

    env = MyEnv(data, device)
    agent = Agent(data, device)
    reward_history = []
    step = 0
    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0

        while not done:
            action, action_prob = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            agent.update(state, action_prob, reward, next_state, done)
            state = next_state
            total_reward += reward

        reward_history.append(total_reward)
```
What I've tried
The error occurs at loss_pi.backward(), and torch.cuda.FloatTensor [1, 68340] matches the size of state and probs (i.e. the action size), so I suspect the problem is in PolicyNet.forward, Agent.get_action, or Agent.update. However, this code is almost identical to the reference implementation, so I have not been able to pin down the cause. MyEnv also worked fine when I used it for a DQN implementation, so I do not think the environment itself is the problem.
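As a next step, one check I am considering (just a sketch I put together, not something from the reference code) is to watch the tensor's in-place version counter around env.step(), since the error says the saved input went from version 1 to version 2. Tensor._version is the internal counter that autograd compares against:

```python
# Hypothetical diagnostic inside the simulation loop: if env.step()
# bumps state._version, then the tensor that self.pi(state) saved for
# backward is being modified in place, which would match
# "is at version 2; expected version 1".
action, action_prob = agent.get_action(state)
version_before = state._version             # autograd's in-place counter
next_state, reward, done, info = env.step(action)
if state._version != version_before:
    print(f"state modified in place during step(): "
          f"{version_before} -> {state._version}")
agent.update(state, action_prob, reward, next_state, done)
```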
Sorry that this is a somewhat open-ended question, but I would appreciate any advice.
Supplementary information (framework/tool versions, etc.)
PyTorch 1.10.2+cu113
Python 3.7.13
