CUDA error: device-side assert triggered

Question

num_epochs = 10
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
train_loss_list, val_loss_list = train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs)
を実行すると以下のエラーが出ます。

クラスを２に指定して、二値分類のセグメンテーションを行うことが目標です。

参考にした Webページの内容より、可能性の一つはクラス数とネットワークモデルの入出力の不一致とありましたが、その方は大丈夫でした。

エラー文より、損失関数の計算の箇所で問題がありそうです。
対処の仕方がわからず困っています。解決のヒントをお願いします。

loss = criterion(outputs, anno_class_imges.long()) / batch_multiplier

```Python
RuntimeError                              Traceback (most recent call last)
Input In [15], in <cell line: 3>()
      1 num_epochs = 10
      2 dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}
----> 3 train_loss_list, val_loss_list = train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs)

Input In [14], in train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs)
     72 with torch.set_grad_enabled(phase == 'train'):
     73     outputs = net(imges)
---> 74     loss = criterion(outputs, anno_class_imges.long()) / batch_multiplier
     76     # 訓練時はバックプロパゲーション
     77     if phase == 'train':

File ~\anaconda3\envs\copy38\lib\site-packages	orch
n\modules\module.py:1110, in Module._call_impl(self, *input, **kwargs)
   1106 # If we don't have any hooks, we want to skip the rest of the logic in
   1107 # this function, and just call forward.
   1108 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1109         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1110     return forward_call(*input, **kwargs)
   1111 # Do not call functions when jit is used
   1112 full_backward_hooks, non_full_backward_hooks = [], []

Input In [12], in PSPLoss.forward(self, outputs, targets)
      8 def forward(self, outputs, targets):
      9     """
     10    損失関数の計算。
     11 
   (...)
     23        損失の値
     24    """
---> 26     loss = F.cross_entropy(outputs[0], targets, reduction='mean')
     27     loss_aux = F.cross_entropy(outputs[1], targets, reduction='mean')
     29     return loss+self.aux_weight*loss_aux

File ~\anaconda3\envs\copy38\lib\site-packages	orch
n\functional.py:2996, in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   2994 if size_average is not None or reduce is not None:
   2995     reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2996 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

RuntimeError: CUDA error: device-side assert triggered

```

使用したコードの一部を貼ります

```Python
#損失関数
class PSPLoss(nn.Module):

def __init__(self, aux_weight=0.4):
        super(PSPLoss, self).__init__()
        self.aux_weight = aux_weight  # aux_lossの重み

def forward(self, outputs, targets):

loss = F.cross_entropy(outputs[0], targets, reduction='mean')
        loss_aux = F.cross_entropy(outputs[1], targets, reduction='mean')

return loss+self.aux_weight*loss_aux

criterion = PSPLoss(aux_weight=0.4)

```

```Python
def train_model(net, dataloaders_dict, criterion, scheduler, optimizer, num_epochs):

train_loss_list = []
    val_loss_list = []
    # GPUが使えるかを確認
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)
   
    # ネットワークをGPUへ
    net.to(device)

# ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

# 画像の枚数
    num_train_imgs = 100
    num_val_imgs = 30
    batch_size = 4

# イテレーションカウンタをセット
    iteration = 1
    logs = []

# multiple minibatch
    batch_multiplier = 3

# epochのループ
    for epoch in range(num_epochs):

# 開始時刻を保存
        t_epoch_start = time.time()
        t_iter_start = time.time()
        epoch_train_loss = 0.0  # epochの損失和
        epoch_val_loss = 0.0  # epochの損失和

print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

# epochごとの訓練と検証のループ
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # モデルを訓練モードに
                scheduler.step()  # 最適化schedulerの更新
                optimizer.zero_grad()
                print('（train）')

else:
                net.eval()   # モデルを検証モードに
                print('-------------')
                print('（val）')

# データローダーからminibatchずつ取り出すループ
            count = 0  # multiple minibatch
            for imges, anno_class_imges in dataloaders_dict[phase]:
                # ミニバッチがサイズが1だと、バッチノーマライゼーションでエラーになるのでさける
                if imges.size()[0] == 1:
                    continue

# GPUが使えるならGPUにデータを送る
                imges = imges.to(device, dtype=torch.float)
                anno_class_imges = anno_class_imges.to(device, dtype=torch.float)

# multiple minibatchでのパラメータの更新
                if (phase == 'train') and (count == 0):
                    optimizer.step()
                    optimizer.zero_grad()
                    count = batch_multiplier

# 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(imges)
                    loss = criterion(outputs, anno_class_imges.long()) / batch_multiplier

# 訓練時はバックプロパゲーション
                    if phase == 'train':
                        loss.backward()  # 勾配の計算
                        count -= 1  # multiple minibatch

if (iteration % 10 == 0):  # 10iterに1度、lossを表示
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iteration {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item()/batch_size*batch_multiplier, duration))
                            t_iter_start = time.time()

epoch_train_loss += loss.item() * batch_multiplier
                        iteration += 1

# 検証時
                    else:
                        epoch_val_loss += loss.item() * batch_multiplier
      
        # epochのphaseごとのlossと正解率
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss/num_train_imgs, epoch_val_loss/num_val_imgs))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

#lossの可視化のリスト
        train_loss_list.append(epoch_train_loss/num_train_imgs)
        val_loss_list.append(epoch_val_loss/num_val_imgs)
        if ((epoch+1) % 5 == 0):
          torch.save(net.state_dict(), 'weights/pspnet50_' + str(epoch+1) + '.pth')
        # ログを保存
        
        log_epoch = {'epoch': epoch+1, 'train_loss': epoch_train_loss /
                     num_train_imgs, 'val_loss': epoch_val_loss/num_val_imgs}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")
        
    # 最後のネットワークを保存する
    torch.save(net.state_dict(), 'weights/pspnet50_' +
               str(epoch+1) + '.pth')
    return train_loss_list, val_loss_list
```

Answer

> CUDA error: device-side assert triggered
GPUではなくCPUで動かしてみてください。

Answer

> CUDA error: device-side assert triggered

参考
[第３章　2クラス分類の場合のエラー #182](https://github.com/YutaroOgawa/pytorch_advanced/issues/182)

関連した質問