Pytorchの学習が段々遅くなってしまう

前提

Chainerで書いていたコードをPytorchで書き直して実行していたところ学習がだんだんと遅くなってしまう現象に遭遇しました．
最初は1エポック5分程度だったのに，50エポックあたりで30分以上かかる様になってしまいました．
Chainerではその様なことはなかったので，ChainerとPytorchの違いで私が理解していない部分があるのではと思います．
2つの実行時の様子を見ていたところPytorchでは学習時間が遅くなるにつれ，CPUメモリが圧迫されている様でした．最初は5GiBほどだったのに，いつの間にか20GiBまで膨れ上がり，メモリの殆どを使用していました．(Chainerでは常に5GiB程で一定でした)

ここに問題の解決策があるのでは，と考えているのですが，コード上のどこが問題なのかわからないため教えていただきたいです．
エポック数が進む毎にCPUメモリを食っているので学習ループの部分に問題があるのでは，と考えています
(main.pyの★からの部分です)が，どこに問題があるのか分かりません・・・
私が考えている部分以外にもコード上に問題があればぜひコメントお願いいたします．

Python
1#main.py データセットの事前処理の部分などは省いています
2def main(param, out_dir):
3'''
4一部省略
5'''
6    param.savetxt_parameter(out_dir)
7    file_log = open(out_dir + "log.txt", "w")
8
9    accuracy_plot = [[],[],[]]
10    loss_plot = [[],[],[]]
11    score_plot = [[],[],[]]
12    score_jma_plot = [[],[],[]]
13
14    max_accuracy = 0
15    min_loss = 100
16
17    best_accuracy = np.array([0, 0])
18    best_loss = np.array([100, 100])
19    best_score = np.array([0, 0])
20    best_score_jma = np.array([0, 0])
21
22    accuracy = np.array([0, 0, 0])
23    score = np.array([0, 0, 0])
24    score_jma = np.array([0, 0, 0])
25    loss_cal = np.array([0, 0, 0])
26
27    count_save_model = 0
28
29    #★ 学習開始
30    #Loop epochs
31    for epoch in range(param.max_epoch):
32        #データのシャッフル
33        dataset = permutation_nclass(sequence)
34        dataset_batch = ["","",""]
35
36        loss_model = 0
37        loss_sum = np.array([0,0,0])
38        count_loss = 0
39
40        y_save = [np.array([]),np.array([]),np.array([])]
41        t_save = [np.array([]),np.array([]),np.array([])]
42
43        #Loop batches
44        for i in range(3):
45            for batch in range(ITERATIONS):
46                if i==0: #学習
47                    batch_size = param.batch_size
48                else: #検証・テスト
49                    batch_size = batchsize_test
50
51                dataset_batch[i] = dataset[i][batch*batch_size:(batch+1)*batch_size]
52
53                if i==0:
54                    dataset_batch[0] = dataset[0][batch*param.batch_size:(batch+1)*param.batch_size]
55                else:
56                    dataset_batch[i] = dataset[i][batch*batchsize_test:(batch+1)*batchsize_test]
57                
58                 seq_loop = param.sequence_size
59                    if i == 0:
60                        loss_calculator.model.reset_lstm(32)
61                    else:
62                        loss_calculator.model.reset_lstm(4)
63
64                for seq in range(seq_loop):
65                    dataset_seq_ = dataset_batch[i][:,seq] #シーケンスから1つ抜き出し (batch, datas)
66
67                    values = dataset_seq_[:,:len(param.use_value)].astype(dtype=np.float32) #数値データ抜き出し
68                    t = dataset_seq_[:,len(param.use_value)].astype(dtype=np.float32)  # Loss計算用t (教師データ)
69                    t_not_norm = dataset_seq_[:,len(param.use_value)+1].astype(dtype=np.float32)  # 評価用t 
70                    t = t[:,np.newaxis]
71                    
72                    if seq == seq_loop-1:
73                        t_save[i] = np.concatenate((t_save[i], t_not_norm))
74
75                    images = dataset_seq_[:, -1] #画像名抽出
76                    images = np.array(module.data_file.load_image(images, picsize).astype(dtype=np.float32))
77                    dataset_input = (values, images, t) #数値データ，画像データ，教師データ
78
79                    if i==0:
80                        loss, y = loss_calculator(dataset_input[0],dataset_input[1], dataset_input[2])
81                        loss_model += loss #メモリ不足回避
82                        
83                    else:
84                        loss, y = loss_calculator(dataset_input[0],dataset_input[1], dataset_input[2], train=False)#train=Falseで推論モード
85
86                     y = y.flatten()
87
88                    if seq == seq_loop-1:
89                        y_save[i] = np.concatenate((y_save[i],y))
90
91                    count_loss += 1
92
93
94                loss_calculator.zero_grad() 
95                if i==0:
96                    loss_model.backward(retain_graph=True)
97                    del loss_model
98                    torch.cuda.empty_cache() 
99　　　　　　  optimizer.step() 
100
101        #評価
102        elapsed = datetime.datetime.now() - st
103        elapsed_per_epoch = datetime.datetime.now() - now
104        now = datetime.datetime.now()
105
106        module.output.print_file(["  EPOCH:", epoch+1, "/ TIME:", elapsed, now], file_log)
107        module.output.print_file(["  Next epoch will be finished at", now+elapsed_per_epoch], file_log)
108        Label = ["Train", "Validation", "Test"]
109
110
111        save_model = False
112        for i in range(3):
113            accuracy, score, score_jma, accuracy_table, loss_cal = module.module_evaluate.evaluate_v2(y_save[i], t_save[i], param)
114            module.output.print_file([Label[i], " / Loss:", loss_cal, "/ Accuracy:", accuracy,"/ Score:", score,"/ Score JMA:",score_jma,"/ Accuracy table is shown..."], file_log)
115            module.output.print_file([accuracy_table], file_log)
116            accuracy_plot[i].append(accuracy)
117            loss_plot[i].append(loss_cal)
118            score_plot[i].append(score)
119            score_jma_plot[i].append(score_jma)
120
121        module.output.draw_train_graph(out_dir+"accuracy.png","accuracy",accuracy_plot[0],accuracy_plot[1],accuracy_plot[2])	#12/17　l386~389に~plot[2]追加
122        module.output.draw_train_graph(out_dir+"loss.png","loss",loss_plot[0],loss_plot[1],loss_plot[2])
123        module.output.draw_train_graph(out_dir+"score.png", "score", score_plot[0], score_plot[1],score_plot[2])
124        module.output.draw_train_graph(out_dir+"score_jma.png", "score_jma", score_jma_plot[0], score_jma_plot[1],score_jma_plot[2])
125
126        graph_data = [accuracy_plot, loss_plot, score_plot, score_jma_plot]
127        np.save(out_dir+"graph_plot_data", graph_data)
128
129    print("training finish")
130    file = open(out_dir+"finished.txt", "w")
131    file.close()
132
133if __name__ == "__main__":
134    param = parameter(ResNetLSTM_model_id) #constant.pyから読み込み
135    param.docker = False
136    param.model_type = Regression
137    param.forecast_time = 120
138    param.max_epoch = 150
139    param.r_max = 20
140    param.sequence_size = 10
141    param.time_sequence = 30
142    param.panorama = True
143    param.log_norm = False
144    param.max_dataset =5000
145    param.data_rate = [8,1,1]
146    param.num_of_directories = 5
147    param.adjust_all = False
148    #格納ファイルづくり
149    now = datetime.datetime.now()
150    now = 1
151    out_dir = "result/output/"
152    out_dir += str(now) + "/"
153    main(param,out_dir)

また，main.pyの中で利用している学習モデルは以下になります

Python
1#ResNETLSTM.py
2Classification = True
3Regression = False
4
5class ResNetLSTM(nn.Module):
6    def __init__(self, output = 2, model_type=Classification):
7        super(ResNetLSTM, self).__init__()
8        if not model_type:
9            output = 1
10        self.base = models.resnet152(pretrained=True)
11        self.fc = nn.Linear(2048, 1)
12
13        self.lstm_in = nn.Linear(7, 20)
14        self.lstm = nn.LSTMCell(20, 20)
15        self.lstm_out = nn.Linear(20, output)
16
17        self.model_type = model_type
18        
19    def _init_weights(self, module):#重みの初期化
20        if isinstance(module, nn.Linear):
21            module.weight.data.normal_(mean=0.0, std=0.01)
22            if module.bias is not None:
23                module.bias.data.zero_()
24        elif isinstance(module, nn.LSTMCell):
25            module.weight.data.normal_(mean=0.0, std=0.01)
26            if module.bias is not None:
27                module.bias.data.zero_()
28
29    
30    def __call__(self, x, x_pic,train):
31        if x.ndim == 1:
32            x = x[:, np.newaxis]
33        if train:
34            batch = 32
35        else:
36            batch = 4
37
38        x_pic = torch.tensor(x_pic).to('cuda') #(32, 3, 33, 220)
39        x = torch.tensor(x).to('cuda') #(32, 6)
40        feature_extractor = create_feature_extractor(self.base, {"avgpool":"feature"}) #(32, 2048)
41        h = feature_extractor(x_pic)["feature"]
42        h = self.fc(h.reshape(batch,2048)) #(32, 1)
43        cnn_out = torch.sigmoid(h) #(32, 1) 
44
45        h = torch.cat([cnn_out, x],dim=1)
46        h = self.lstm_in(h)
47        h,_ = self.lstm(h)
48        h = self.lstm_out(h)
49        return h
50
51    def reset_lstm(self,batch):
52        if torch.cuda.is_available() == True:
53            self.h0 = torch.zeros(batch,20).cuda()
54            self.c0 = torch.zeros(batch,20).cuda()
55        else:
56            self.h0 = torch.zeros(batch,20)
57            self.c0 = torch.zeros(batch,20)
58
59    def zerograd(self): #使ってないhttps://studylog.hateblo.jp/entry/2016/09/24/131954参照。cleargrad使ってるみたい
60        self.zero_grad()
61
62class LossCalculator(nn.Module):
63    def __init__(self, model):
64        super(LossCalculator, self).__init__()
65        self.model = model
66
67    def __call__(self, x, x_pic, t, train=True):
68        with torch.set_grad_enabled(train):
69            t = torch.tensor(t).to('cuda')
70            y = self.model(x, x_pic,train)
71
72            if self.model.model_type:
73                loss = F.cross_entropy(y,t) #クラス分類用誤差関数(output>=2)
74            else:
75                loss = F.mse_loss(y, t) #回帰用誤差関数(output=1)
76        if train:
77            return loss, y.to('cpu').detach().numpy().copy()
78        else:
79            return loss, y.to('cpu').detach().numpy().copy()
80
81class predictor(chainer.Chain): #使ってないかも
82    def __init__(self, model):
83        super(predictor, self).__init__()
84        self.model = model
85
86    def __call__(self, x, x_pic):
87        with torch.set_grad_enabled(train):
88            y = self.model(x, x_pic)
89        return y.detach().numpy().copy()

補足

プログラムを一部省略していますので，この変数や関数何？というのがあればコメントいただけると助かります．

行動規範の内容に同意します

回答1件

自己解決

Class ResNetLSTM内の
feature_extractor = create_feature_extractor(self.base, {"avgpool":"feature"})
が原因でした．これがループ処理ごとに新たなモデルを生成し，かつ，前に生成したモデルとは別の領域のメモリを使っていたためにメモリが徐々に占領されていった様です．
feature_extractor = create_feature_extractor(self.base, {"avgpool":"feature"})を
def init(self, output = 2, model_type=Classification):内に書くことで解決しました．

投稿2022/10/28 22:24

RyosuK.S

総合スコア45