pytorch-unet3Dを実行して、ちゃんと動いているか？

下記のコードを実行すると、
$ python main.py
Number of training patches per epoch: 16
Number of validation patches per epoch: 16
Number of testing patches per epoch: 16

と出てきます。[@pc33]は出てこず、機械学習できているように感じるのですが、１日経ってもこの状態なので、学習できているか心配です。
$ nvidia-smi
でGPUを確認すると、ちゃんと動いているようでした。
無限ループに陥っているとかですか？それともちゃんと機械学習できていますか？
print文で確認したら、
if args.multi_gpu is True:
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id # Multi-gpu selector for training
net = torch.nn.DataParallel((UNet(residual='pool')).cuda()) # load the network Unet
の手前まで実行できていました。
しかし、このif,elseはどちらも実行されていません。
args.gpu_idに問題があるのかと思うのですが、どう直せばいいですか？
使っているGPUは１つ。defaultのままなので、ID=0だと思います。

python
1import time
2import os
3import torch
4from predict import *
5from torch.utils.data import DataLoader
6from torch.autograd import Variable
7from utils import AvgMeter, dice_coeff, check_dir, DiceLoss, BinaryDiceLoss
8from init import InitParser
9from NiftiDataset import *
10import NiftiDataset as NiftiDataset
11from UNet import UNet
12
13
def test_epoch(net, loader):
    """Evaluate ``net`` on every batch of ``loader`` and return the mean Dice.

    Args:
        net: Network to evaluate. It is switched to eval mode and called on
            each batch after the batch has been moved to the current CUDA
            device.
        loader: Iterable yielding ``(data, label)`` pairs.

    Returns:
        The ``avg`` attribute of the ``AvgMeter`` that accumulated the
        per-batch ``dice_coeff`` values.
    """
    # Switch the network to evaluation mode.
    net.eval()
    test_dice_meter = AvgMeter()

    # Evaluation never calls backward(), so skip autograd bookkeeping: this
    # avoids building a graph for every validation/test batch.
    with torch.no_grad():
        for data, label in loader:
            output = net(data.cuda())

            # Bring prediction and label back to the CPU as NumPy arrays,
            # with size-1 dimensions removed, before scoring them.
            output = output.detach().squeeze().cpu().numpy()
            label = label.squeeze().cpu().numpy()

            test_dice_meter.update(dice_coeff(output, label))

    return test_dice_meter.avg
29
30
def train_epoch(net, loader, optimizer, cost):
    """Train ``net`` for one pass over ``loader`` and return the mean loss.

    Args:
        net: Network to train. It is switched to train mode.
        loader: Iterable yielding ``(data, label)`` pairs; both are moved to
            the current CUDA device before use.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called once per
            batch.
        cost: Callable invoked as ``cost(output, label)``; its result must
            support ``backward()`` and ``item()``.

    Returns:
        The ``avg`` attribute of the ``AvgMeter`` that accumulated the
        per-batch loss values.
    """
    # Switch the network to training mode.
    net.train()

    batch_loss = AvgMeter()
    for batch_idx, (data, label) in enumerate(loader):
        # Plain tensors are enough here; the Variable wrapper is deprecated.
        data = data.cuda()
        label = label.cuda()

        output = net(data)
        loss = cost(output, label)

        # PyTorch accumulates gradients across backward passes, so clear them
        # before back-propagating this batch.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss.update(loss.item())
        if batch_idx % 10 == 0:
            # The Dice score is only reported, so pay for the GPU->CPU copy
            # solely on the batches that are actually printed.
            dice = dice_coeff(
                output.detach().squeeze().cpu().numpy(),
                label.squeeze().cpu().numpy(),
            )
            print("Train Batch {} || Loss: {:.4f} | Training Dice: {:.4f}".format(str(batch_idx).zfill(4), batch_loss.val, dice))
    return batch_loss.avg
56
57
def _build_network(args):
    """Create the UNet on the GPU, wrapped in DataParallel when requested."""
    # NOTE: the ``is True`` comparison is kept on purpose. Plain truthiness
    # would also accept non-bool values such as the string "False".
    if args.multi_gpu is True:
        # os.environ only accepts strings; str() keeps an integer gpu_id
        # from raising TypeError here.
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
        return torch.nn.DataParallel(UNet(residual='pool').cuda())

    torch.cuda.set_device(args.gpu_id)
    return UNet(residual='pool').cuda()


def main(args):
    """Train the network and/or check its accuracy, as selected by ``args``.

    When ``args.do_you_wanna_train is True`` the network is trained for
    ``args.num_epoch`` epochs starting at ``args.init_epoch``. One line per
    epoch is printed and appended to ``<output_path>/Log/train_log.txt``, and
    checkpoints are written under ``<output_path>/Checkpoint``.

    When ``args.do_you_wanna_check_accuracy is True`` the best checkpoint is
    loaded and ``check_accuracy_model`` is run on the validation, testing and
    training lists.

    Args:
        args: Object exposing the run configuration as attributes
            (``output_path``, ``patch_size``, ``gpu_id``, ...).
    """
    ckpt_path = os.path.join(args.output_path, "Checkpoint")
    log_path = os.path.join(args.output_path, "Log")
    best_ckpt_file = os.path.join(ckpt_path, "Best_Dice.pth.gz")

    patch_size = (args.patch_size[0], args.patch_size[1], args.patch_size[2])
    # args.min_pixel is applied as a percentage of the patch volume.
    min_pixel = int(args.min_pixel * ((patch_size[0] * patch_size[1] * patch_size[2]) / 100))

    check_dir(args.output_path)
    check_dir(log_path)
    check_dir(ckpt_path)

    if args.do_you_wanna_train is True:

        train_list = create_list(args.data_path)
        val_list = create_list(args.val_path)
        test_list = create_list(args.test_path)

        # Each iteration appends a list to itself, so every list ends up
        # 2 ** increase_factor_data times its original length.
        for _ in range(args.increase_factor_data):
            train_list.extend(train_list)
            val_list.extend(val_list)
            test_list.extend(test_list)

        print('Number of training patches per epoch:', len(train_list))
        print('Number of validation patches per epoch:', len(val_list))
        print('Number of testing patches per epoch:', len(test_list))

        trainTransforms = [
            NiftiDataset.Resample(args.new_resolution, args.resample),
            NiftiDataset.Augmentation(),
            NiftiDataset.Padding(patch_size),
            NiftiDataset.RandomCrop(patch_size, args.drop_ratio, min_pixel),
        ]

        # Same pipeline as training, minus the augmentation step.
        valTransforms = [
            NiftiDataset.Resample(args.new_resolution, args.resample),
            NiftiDataset.Padding(patch_size),
            NiftiDataset.RandomCrop(patch_size, args.drop_ratio, min_pixel),
        ]

        # Define the datasets and their loaders.
        train_set = NifitDataSet(train_list, transforms=trainTransforms, train=True)
        val_set = NifitDataSet(val_list, transforms=valTransforms, test=True)
        test_set = NifitDataSet(test_list, transforms=valTransforms, test=True)

        train_loader = DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=args.batch_size, shuffle=False)
        test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False)

        # Define the network and optionally load initial weights.
        net = _build_network(args)

        if args.do_you_wanna_load_weights is True:
            net.load_state_dict(torch.load(args.load_path))

        optimizer = torch.optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.weight_decay)
        cost = BinaryDiceLoss()

        log_file_path = os.path.join(log_path, 'train_log.txt')
        best_dice = 0.
        for epoch in range(args.init_epoch, args.init_epoch+args.num_epoch):
            start_time = time.time()

            # Train one epoch, then score the validation and testing data.
            epoch_loss = train_epoch(net, train_loader, optimizer, cost)
            epoch_dice_val = test_epoch(net, val_loader)
            epoch_dice_test = test_epoch(net, test_loader)

            epoch_time = time.time() - start_time

            info_line = "Epoch {} || Loss: {:.4f} | Time(min): {:.2f} |Validation Dice: {:.4f} | Testing Dice: {:.4f}".format(
                str(epoch).zfill(3), epoch_loss, epoch_time/60, epoch_dice_val, epoch_dice_test
            )
            print(info_line)
            # Use a context manager so the log is flushed and closed every
            # epoch instead of leaving one open handle behind per epoch.
            with open(log_file_path, 'a') as log_file:
                log_file.write(info_line+'\n')

            # Periodic checkpoint, plus the best-so-far on validation Dice.
            if epoch % 10 == 0:
                torch.save(net.state_dict(), os.path.join(ckpt_path, "Network_{}.pth.gz".format(epoch)))
            if epoch_dice_val > best_dice:
                best_dice = epoch_dice_val
                torch.save(net.state_dict(), best_ckpt_file)

    if args.do_you_wanna_check_accuracy is True:

        net = _build_network(args)

        # Prefer the checkpoint training writes under args.output_path; fall
        # back to the previously hard-coded location so existing setups that
        # only have the legacy file keep working.
        weights_file = best_ckpt_file
        if not os.path.isfile(weights_file):
            weights_file = './History/Checkpoint/Best_Dice.pth.gz'
        net.load_state_dict(torch.load(weights_file))

        train_list = create_list(args.data_path)
        val_list = create_list(args.val_path)
        test_list = create_list(args.test_path)

        print("Checking accuracy on validation set")
        Dice_val = check_accuracy_model(net, val_list, args.resample, args.new_resolution, args.patch_size[0],
                                        args.patch_size[1], args.patch_size[2],
                                        args.stride_inplane, args.stride_layer)

        print("Checking accuracy on testing set")
        Dice_test = check_accuracy_model(net, test_list, args.resample, args.new_resolution, args.patch_size[0],
                                         args.patch_size[1], args.patch_size[2],
                                         args.stride_inplane, args.stride_layer)

        print("Checking accuracy on training set")
        Dice_train = check_accuracy_model(net, train_list, args.resample, args.new_resolution, args.patch_size[0],
                                          args.patch_size[1], args.patch_size[2],
                                          args.stride_inplane, args.stride_layer)

        print("Dice_val:",Dice_val,"Dice_test:",Dice_test,"Dice_train:",Dice_train)
183
184
if __name__ == '__main__':
    # Script entry point: build the run configuration with InitParser and
    # pass it straight to main().
    main(InitParser())
188

jbpb0

2021/11/05 23:39

> from init import InitParser

でインポートされた「InitParser」で

> def main(args):

の「args」が決まってます

> args.gpu_idに問題があるのかと思うのですが、どう直せばいいですか？

「InitParser」のコードを、質問に追記してください

jbpb0

2021/11/06 00:01

> for epoch in range(…

のループに入って学習が始まったら、10batch毎に

> print("Train Batch…

で表示され、各epochで

> print(info_line)

で表示されるはずなので、もし正常に学習されてて

> １日経ってもこの状態

なら、1日経っても10batchも進んでないことになると思います

> 学習できているか心配

・一つのデータで推論するのにかかる時間
・10batchのデータ数

を掛け算して、それが1日以上かかりそうか、確認してみたらいいと思います