SSDをマルチGPUで学習しようとすると、「Gather function not implemented for CPU tensors」（Gather関数はCPUテンソルに対して実装されていない）というエラーが表示される

### 前提・実現したいこと
PyTorchによるSSDを用いた物体検出の訓練についての質問です。
「作りながら学ぶ　PyTorchによる発展ディープラーニング」
の第2章「2-7_SSD_training.ipynb」
において、`net = nn.DataParallel(net)` を追加してマルチGPUで学習を実行したところ、以下のエラーメッセージが表示されました。

### 発生している問題・エラーメッセージ

AssertionError                            Traceback (most recent call last)
<ipython-input-34-56fa4f8d86af> in <module>
      1 # 学習・検証を実行する
      2 num_epochs= 10
----> 3 train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

<ipython-input-33-645d91cb3a1e> in train_model(net, dataloaders_dict, criterion, optimizer, num_epochs)
     60                 with torch.set_grad_enabled(phase == 'train'):
     61                     # 順伝搬（forward）計算
---> 62                     outputs = net(images)
     63 
     64                     # 損失の計算

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    725             result = self._slow_forward(*input, **kwargs)
    726         else:
--> 727             result = self.forward(*input, **kwargs)
    728         for hook in itertools.chain(
    729                 _global_forward_hooks.values(),

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
    160         replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
    161         outputs = self.parallel_apply(replicas, inputs, kwargs)
--> 162         return self.gather(outputs, self.output_device)
    163 
    164     def replicate(self, module, device_ids):

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in gather(self, outputs, output_device)
    172 
    173     def gather(self, outputs, output_device):
--> 174         return gather(outputs, output_device, dim=self.dim)
    175 
    176 

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in gather(outputs, target_device, dim)
     66     # Setting the function to None clears the refcycle.
     67     try:
---> 68         res = gather_map(outputs)
     69     finally:
     70         gather_map = None

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in gather_map(outputs)
     61             return type(out)(((k, gather_map([d[k] for d in outputs]))
     62                               for k in out))
---> 63         return type(out)(map(gather_map, zip(*outputs)))
     64 
     65     # Recursive function calls like this create reference cycles.

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/scatter_gather.py in gather_map(outputs)
     53         out = outputs[0]
     54         if isinstance(out, torch.Tensor):
---> 55             return Gather.apply(target_device, dim, *outputs)
     56         if out is None:
     57             return None

~/anaconda3/envs/vgg/lib/python3.6/site-packages/torch/nn/parallel/_functions.py in forward(ctx, target_device, dim, *inputs)
     54     def forward(ctx, target_device, dim, *inputs):
     55         assert all(map(lambda i: i.device.type != 'cpu', inputs)), (
---> 56             'Gather function not implemented for CPU tensors'
     57         )
     58         target_device = _get_device_index(target_device, True)

AssertionError: Gather function not implemented for CPU tensors

### 該当のソースコード

python
# Build the function that trains (and periodically validates) the model


def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for ``num_epochs`` epochs, validating every 10th epoch.

    Per-epoch loss sums are appended to ``log_output.csv`` (rewritten after
    every epoch) and the model weights are written to
    ``weights/ssd300_<epoch>.pth`` every 10th epoch.

    Args:
        net: Model to train; called as ``net(images)``. It is wrapped in
            ``nn.DataParallel`` unless it already is one.
        dataloaders_dict: Mapping with ``'train'`` and ``'val'`` keys; each
            value is iterated to obtain ``(images, targets)`` pairs, where
            ``targets`` is an iterable of tensors.
        criterion: Callable ``criterion(outputs, targets)`` returning a pair
            ``(loss_l, loss_c)`` whose sum is the loss that is optimised.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        num_epochs: Number of epochs to run.

    Returns:
        None.
    """
    import os  # local import: only needed to create the checkpoint directory

    # Use the first GPU when CUDA is available, otherwise fall back to CPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("使用デバイス：", device)

    # Wrap for multi-GPU execution, but never wrap twice
    if not isinstance(net, nn.DataParallel):
        net = nn.DataParallel(net)
    # NOTE(review): DataParallel gathers the replicas' outputs with a function
    # that asserts none of them is a CPU tensor ("Gather function not
    # implemented for CPU tensors"). If the wrapped model returns any tensor
    # that lives on the CPU (presumably a constant kept as a plain attribute
    # rather than a parameter/buffer -- confirm against the model's forward),
    # that has to be fixed in the model itself; it cannot be fixed here.

    # Move the network to the selected device
    net.to(device)

    # Let cuDNN auto-tune its algorithms; helps when the network is mostly fixed
    torch.backends.cudnn.benchmark = True

    # Global iteration counter (counts training mini-batches across epochs)
    iteration = 1
    logs = []

    # Epoch loop. range(num_epochs) so that the last epoch printed is
    # "Epoch num_epochs/num_epochs" (the former num_epochs+1 ran one extra).
    for epoch in range(num_epochs):
        epoch_no = epoch + 1
        epoch_train_loss = 0.0  # sum of training losses in this epoch
        epoch_val_loss = 0.0  # sum of validation losses in this epoch

        # Remember start times for the epoch and for the 10-iteration timer
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch_no, num_epochs))
        print('-------------')

        # Training phase, then (every 10th epoch only) validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # switch the model to training mode
                print('（train）')
            elif epoch_no % 10 == 0:
                net.eval()  # switch the model to evaluation mode
                print('-------------')
                print('（val）')
            else:
                # Validation only runs once every 10 epochs
                continue

            # Pull mini-batches from the data loader for this phase
            for images, targets in dataloaders_dict[phase]:

                # Send the batch to the selected device
                images = images.to(device)
                targets = [ann.to(device) for ann in targets]

                # Reset accumulated gradients
                optimizer.zero_grad()

                # Forward pass; gradients are tracked only while training
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # Compute the loss as the sum of the criterion's two terms
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        loss.backward()  # compute gradients

                        # Clip each gradient value to [-2.0, 2.0] to keep
                        # the update numerically stable
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # update the parameters

                        if iteration % 10 == 0:  # report every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('イテレーション {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1
                    else:
                        # Validation: only accumulate the loss
                        epoch_val_loss += loss.item()

        # Report the per-phase loss sums and elapsed time for this epoch
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch_no, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Persist the running log (the CSV is rewritten in full each epoch)
        log_epoch = {'epoch': epoch_no,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        # Save the network every 10th epoch
        if epoch_no % 10 == 0:
            # Make sure the target directory exists so a long run is not lost
            os.makedirs('weights', exist_ok=True)
            # Save the wrapped module's weights so the keys carry no
            # "module." prefix and load into a plain (non-DataParallel) model
            torch.save(net.module.state_dict(), 'weights/ssd300_' +
                       str(epoch_no) + '.pth')


# Run training and validation
num_epochs = 10
train_model(
    net,
    dataloaders_dict,
    criterion,
    optimizer,
    num_epochs=num_epochs,
)