I want to eliminate an error in GPU training with PyTorch
I am currently running the sample training of Ternary-Weights-Network, source code that ternarizes parameter values during deep-learning training. I downloaded it from GitHub and training on the CPU works, but training on the GPU does not.
Problem / error message
Traceback (most recent call last):
  File "main.py", line 149, in <module>
    main()
  File "main.py", line 84, in main
    train(args,epoch_index,train_loader,model,optimizer,criterion)
  File "main.py", line 108, in train
    loss.backward()
  File "/usr/local/lib/python3.5/dist-packages/torch/tensor.py", line 118, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py", line 94, in backward
    Variable._execution_engine.run_backward(
RuntimeError: expected device cuda:0 and dtype Float but got device cpu and dtype Float
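My reading of the message is that some tensor in the backward graph is still on the CPU while the rest of the graph is on cuda:0. Below is a minimal hypothetical sketch (my own example, not code from the project) of the pattern I suspect: a parameter whose .grad buffer was created on cuda:0 and whose .data is later replaced by a CPU tensor.

import torch

# First pass: the leaf parameter and its .grad buffer live on cuda:0.
w = torch.randn(3, requires_grad=True, device='cuda')
(w * 2).sum().backward()           # w.grad is created on cuda:0

# Later, .data is swapped for a CPU tensor, and the forward pass
# copies it back to the GPU with .cuda().
w.data = w.data.cpu()
(w.cuda() * 2).sum().backward()    # backward yields a CPU gradient and tries
                                   # to accumulate it into the cuda:0 w.grad
                                   # buffer -> the RuntimeError above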
Relevant source code
main.py
python3
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
from torchvision import datasets, transforms
import argparse

import model as M
import util as U

def ParseArgs():
    parser = argparse.ArgumentParser(description='Ternary-Weights-Network Pytorch MNIST Example.')
    parser.add_argument('--batch-size',type=int,default=100,metavar='N',
                        help='batch size for training(default: 100)')
    parser.add_argument('--test-batch-size',type=int,default=100,metavar='N',
                        help='batch size for testing(default: 100)')
    parser.add_argument('--epochs',type=int,default=100,metavar='N',
                        help='number of epochs to train(default: 100)')
    parser.add_argument('--lr-epochs',type=int,default=20,metavar='N',
                        help='number of epochs to decay learning rate(default: 20)')
    parser.add_argument('--lr',type=float,default=1e-3,metavar='LR',
                        help='learning rate(default: 1e-3)')
    parser.add_argument('--momentum',type=float,default=0.9,metavar='M',
                        help='SGD momentum(default: 0.9)')
    parser.add_argument('--weight-decay','--wd',type=float,default=1e-5,metavar='WD',
                        help='weight decay(default: 1e-5)')
    parser.add_argument('--no-cuda',action='store_true',default=False,
                        help='disable CUDA training')
    parser.add_argument('--seed',type=int,default=1,metavar='S',
                        help='random seed(default: 1)')
    parser.add_argument('--log-interval',type=int,default=100,metavar='N',
                        help='how many batches to wait before logging training status')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    return args

def main():
    args = ParseArgs()
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}

    BATCH_SIZE = args.batch_size
    TEST_BATCH_SIZE = args.test_batch_size
    learning_rate = args.lr
    momentum = args.momentum
    weight_decay = args.weight_decay

    ###################################################################
    ##                     Load Train Dataset                        ##
    ###################################################################
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./mnist_data', train=True, download=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=BATCH_SIZE, shuffle=True, **kwargs)
    ###################################################################
    ##                     Load Test Dataset                         ##
    ###################################################################
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('./mnist_data', train=False, download=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=TEST_BATCH_SIZE, shuffle=True, **kwargs)

    model = M.LeNet5_T()
    print("Using cuda is :", args.cuda)
    if args.cuda:
        model.cuda()
    criterion = nn.CrossEntropyLoss()
    if args.cuda:
        criterion.cuda()
    #optimizer = optim.SGD(model.parameters(),lr=learning_rate,momentum=momentum)
    optimizer = optim.Adam(model.parameters(),lr=learning_rate,weight_decay=weight_decay)

    best_acc = 0.0
    for epoch_index in range(1,args.epochs+1):
        adjust_learning_rate(learning_rate,optimizer,epoch_index,args.lr_epochs)
        train(args,epoch_index,train_loader,model,optimizer,criterion)
        acc = test(args,model,test_loader,criterion)
        if acc > best_acc:
            best_acc = acc
            U.save_model(model,best_acc)

def train(args,epoch_index,train_loader,model,optimizer,criterion):
    model.train()
    for batch_idx,(data,target) in enumerate(train_loader):
        if args.cuda:
            data,target = data.cuda(),target.cuda()
            model.cuda()
        data,target = Variable(data),Variable(target)

        show = False
        if show:
            print('data : {}'.format(data))
            print('target : {}'.format(target))
            import pdb;pdb.set_trace()

        optimizer.zero_grad()

        output = model(data)
        loss = criterion(output,target)
        loss.backward()

        optimizer.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch_index, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.data))


def test(args,model,test_loader,criterion):
    ~ omitted ~

def adjust_learning_rate(learning_rate,optimizer,epoch_index,lr_epoch):
    lr = learning_rate * (0.1 ** (epoch_index // lr_epoch))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr

if __name__ == '__main__':
    main()
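Since the traceback points at loss.backward(), I am thinking of adding a check like the following just before that call (these debugging lines are my own addition, not part of the repository):

# Hypothetical debugging lines for train(), placed just before
# loss.backward(), to find out which tensor is still on the CPU:
for name, p in model.named_parameters():
    if p.device.type != 'cuda':
        print('CPU parameter:', name, p.device)
    if p.grad is not None and p.grad.device != p.device:
        print('param/grad device mismatch:', name, p.device, p.grad.device)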
model.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 28 13:34:27 2018

@author: bai
"""

import torch
import torch.nn as nn
import torch.nn.functional as F

i=0

def Ternarize(tensor):
    output = torch.zeros(tensor.size())
    delta = Delta(tensor)
    alpha = Alpha(tensor,delta)
    for i in range(tensor.size()[0]):
        for w in tensor[i].view(1,-1):
            pos_one = (w > delta[i]).type(torch.FloatTensor)
            neg_one = -1 * (w < -delta[i]).type(torch.FloatTensor)
        out = torch.add(pos_one,neg_one).view(tensor.size()[1:])
        device_cpu = torch.device('cpu')
        #alpha[i].to(device_cpu)
        #import pdb;pdb.set_trace()
        output[i] = torch.add(output[i],torch.mul(out,alpha[i].to(device_cpu)))
    return output

def Alpha(tensor,delta):
    Alpha = []
    for i in range(tensor.size()[0]):
        count = 0
        abssum = 0
        absvalue = tensor[i].view(1,-1).abs()
        for w in absvalue:
            truth_value = w > delta[i] #print to see
        count = truth_value.sum()
        i+=1
        print(i)
        abssum = torch.matmul(absvalue,truth_value.type(torch.FloatTensor).view(-1,1).cuda())
        Alpha.append(abssum/count)
    alpha = Alpha[0]
    for i in range(len(Alpha) - 1):
        alpha = torch.cat((alpha,Alpha[i+1]))
    return alpha

def Delta(tensor):
    n = tensor[0].nelement()
    if(len(tensor.size()) == 4):     #convolution layer
        delta = 0.7 * tensor.norm(1,3).sum(2).sum(1).div(n)
    elif(len(tensor.size()) == 2):   #fc layer
        delta = 0.7 * tensor.norm(1,1).div(n)
    return delta

class TernaryLinear(nn.Linear):
    def __init__(self,*args,**kwargs):
        super(TernaryLinear,self).__init__(*args,**kwargs)

    def forward(self,input):
        self.weight.data = Ternarize(self.weight.data)
        out = F.linear(input,self.weight,self.bias)
        return out

class TernaryConv2d(nn.Conv2d):
    def __init__(self,*args,**kwargs):
        super(TernaryConv2d,self).__init__(*args,**kwargs)

    def forward(self,input):
        self.weight.data = Ternarize(self.weight.data)
        out = F.conv2d(input, self.weight.cuda(), self.bias, self.stride,
                       self.padding, self.dilation, self.groups)
        return out

class LeNet5_T(nn.Module):
    def __init__(self):
        super(LeNet5_T,self).__init__()
        self.conv1 = TernaryConv2d(1,32,kernel_size = 5)
        self.bn_conv1 = nn.BatchNorm2d(32)
        self.conv2 = TernaryConv2d(32,64,kernel_size = 5)
        self.bn_conv2 = nn.BatchNorm2d(64)
        self.fc1 = TernaryLinear(1024,512)
        self.fc2 = TernaryLinear(512,10)

    def forward(self,x):
        x = self.conv1(x)
        x = F.relu(F.max_pool2d(self.bn_conv1(x),2))
        x = self.conv2(x)
        x = F.relu(F.max_pool2d(self.bn_conv2(x),2))
        x = x.view(-1,1024)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
I have made some small changes to the source code found on GitHub.
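In particular, Ternarize() builds output = torch.zeros(tensor.size()) and uses .type(torch.FloatTensor) casts, both of which always allocate CPU tensors, and it explicitly moves alpha[i] to the CPU, so the returned weight ends up on the CPU even when the input weight is on cuda:0. A device-preserving variant I am considering (my own untested sketch; Alpha() would need the same treatment for its torch.FloatTensor cast) would look like this:

# Untested sketch of a replacement for Ternarize() in model.py:
# allocate every intermediate on tensor.device so that a cuda:0
# weight stays on cuda:0 after ternarization.
def Ternarize(tensor):
    output = torch.zeros(tensor.size(), device=tensor.device)
    delta = Delta(tensor)
    alpha = Alpha(tensor, delta)
    for i in range(tensor.size()[0]):
        for w in tensor[i].view(1, -1):
            pos_one = (w > delta[i]).float()        # .float() keeps the device
            neg_one = -1 * (w < -delta[i]).float()
        out = torch.add(pos_one, neg_one).view(tensor.size()[1:])
        output[i] = torch.add(output[i], torch.mul(out, alpha[i]))
    return output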
What I know so far
For some reason, the first epoch completes successfully.
From the second epoch onward, the error above appears.
I confirmed that output and target in loss = criterion(output, target) are both CUDA tensors (the check is sketched below).
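Roughly, this is how I confirmed the devices (simplified; what I have not checked yet is whether the model's weights are still on cuda:0 once the second epoch starts):

# Inside train(), just after the loss computation:
print(output.device, target.device)    # both show cuda:0

# Not yet checked: the device of the ternarized weights after epoch 1, e.g.
print(model.conv1.weight.device)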
Supplementary information (framework/tool versions, etc.)
chainer v6.0.0
NVIDIA-SMI 418.56 / Driver Version: 418.56 / CUDA Version: 10.1