Mixupを使うと学習データへの精度が下がる

実現したいこと

Mixupによるデータ水増しの有効性を調べるためにFashion-Mnistを用いてシミュレーションをしています．

testデータへのAccuracyは殆ど差がないのですが，Mixupを適用しない場合の精度に比べて，MIxup適用後の学習データでのAccuracyが大きく低下してしまい，困っています．
Mixupは性能向上に寄与する手法として提案されたと記憶しているのですが，こういうものなのでしょうか？
もしくは，プログラムのどこかに問題があるのでしょうか？
分かる方いたら教えて頂けるとありがたいです．
・Mixup無し

・Mixup適用後

該当のソースコード

Python
1#  必要ライブラリのロード
2import torch
3import torch.nn as nn
4import torch.nn.functional as F
5import torchvision
6from torchvision import models
7from torchvision.models.feature_extraction import create_feature_extractor
8#from memory_profiler import profile
9#from pytorch_memlab import profile
10import numpy as np
11import matplotlib.pyplot as plt
12from PIL import Image
13from IPython.display import display
14import time
15import pickle
16import pandas as pd
17import seaborn as sns
18from sklearn.datasets import load_iris
19from sklearn.utils import shuffle
20# データセットの読み込み
21from sklearn.model_selection import train_test_split
22
23train_data = torchvision.datasets.FashionMNIST('./fashion-mnist', train=True, download=True, transform=torchvision.transforms.ToTensor())
24test_data = torchvision.datasets.FashionMNIST('./fashion-mnist', train=False, download=True, transform=torchvision.transforms.ToTensor())
25random_state=0 #分割する際のseed
26np.random.seed(5) #Mixupの際のseed
27mix = True #mixupするか
28
29train_data.data,train_data.targets = shuffle(train_data.data,train_data.targets)
30x_train, y_train = train_data.data, train_data.targets
31x_test, y_test = test_data.data, test_data.targets
32
33if torch.cuda.is_available() == True:
34  model = mnist_Net().to('cuda')
35else:
36  model = mnist_Net()
37
38y_train_enc = []
39for i in range(len(y_train)):#Mixupを適用するために教師データをone-hot encodeへ
40  y_train_enc.append([0,0,0,0,0,0,0,0,0,0])
41  y_train_enc[i][y_train[i]] = 1
42y_train_enc = torch.tensor(y_train_enc)
43
44mixup_rate = 0.5 #学習データに対するMixupデータの割合
45mixup_size = int(len(x_train)*mixup_rate)
46mixup_data_x = []
47mixup_data_y = []
48for i in range(mixup_size):
49  index_1 = np.random.randint(0, len(x_train)-1) #x_1,y_1のindex
50  index_2 = np.random.randint(0, len(x_train)-1) #x_2,y_2のindex
51  if index_1 == index_2:#x_1,x_2が同じ時を避ける
52    if index_1 == 0:
53      index_2 += 1
54    else:
55      index_2 -= 1
56  x_1 = x_train[index_1]
57  y_1 = y_train_enc[index_1]
58  x_2 = x_train[index_2]
59  y_2 = y_train_enc[index_2]
60  x,y = mixup(x_1,y_1,x_2,y_2)
61  mixup_data_x.append(x)
62  mixup_data_y.append(y)
63mixup_data_x = torch.stack(mixup_data_x)
64mixup_data_y = torch.stack(mixup_data_y)
65
66x_train,y_train_enc = shuffle(x_train,y_train_enc)
67
68x_train_mix = np.append(mixup_data_x, x_train,axis= 0)
69y_train_mix = np.append(mixup_data_y, y_train_enc,axis= 0)
70
71
72if mix == False:
73  x_train = x_train
74  y_train_enc = y_train_enc
75else:
76  x_train = torch.tensor(x_train_mix)
77  y_train_enc = torch.tensor(y_train_mix)
78
79optimizer = torch.optim.Adam(model.parameters())
80epoch = 150
81batch = 1000
82iteration = int(len(x_train)/batch)
83loss_train = []
84acc_train = []
85loss_test = []
86acc_test = []
87ce = nn.CrossEntropyLoss()
88for i in range(epoch):
89  if i % 10 == 0:
90    print('epoch:', i)
91
92  loss_total_train = 0
93  rmse_train = 0
94  x_train,y_train_enc = shuffle(x_train,y_train_enc)
95  y_output = []
96  y_output_test = []
97  model.train()
98  for j in range(iteration):
99    x_train_batch = x_train[j*batch:(j+1)*batch].to(torch.float32).to('cuda')
100    x_train_batch = torch.unsqueeze(x_train_batch, dim=1)
101    y_train_batch = y_train_enc[j*batch:(j+1)*batch].to(torch.float32).to('cuda') 
102
103    optimizer.zero_grad()
104    y = model(x_train_batch)
105    #y_train_batch = y_train_batch.reshape(batch,1)
106    loss = ce(y, y_train_batch)
107    loss_total_train += loss.to('cpu')
108    loss.backward()
109    optimizer.step()
110
111    y_output.append(torch.argmax(y,dim = 1).to('cpu').detach().numpy())
112  loss_train.append(loss_total_train.detach().numpy()/len(x_train))
113  acc = 0
114  y_output = np.stack(y_output).flatten()
115  for n in range(len(x_train)):
116    if y_output[n] == torch.argmax(y_train_enc[n],dim=0):
117      acc += 1
118  acc_train.append(acc/len(x_train))
119  model.eval()
120  y = model(torch.unsqueeze(x_test.to(torch.float32).to('cuda'),dim=1))
121
122  loss = ce(y, y_test.to(torch.int64).to('cuda'))
123  y_output_test.append(torch.argmax(y,dim = 1).to('cpu').detach().numpy())
124  loss_test.append(loss.to('cpu').detach().numpy())
125  acc = 0
126  for n in range(len(x_test)):
127    if y_output_test[0][n] == y_test[n]:
128      acc += 1
129  acc_test.append(acc/len(x_test))

Python
1def mixup(x_1,y_1,x_2,y_2, alpha = 1):
2  l = np.random.beta(1, 1)
3  x_l = l#.reshape(1, 1, 1)
4  y_l = l#.reshape(1, )
5  mix_x = x_1*x_l + x_2*(1-x_l)
6  mix_y = y_1*y_l + y_2*(1-y_l)
7  return mix_x, mix_y
8
9class mnist_Net(nn.Module):
10  def __init__(self):
11    super(mnist_Net, self).__init__()
12    self.features = nn.Sequential(
13            nn.Conv2d(1, 32, kernel_size=3, padding=1),
14            nn.ReLU(),
15            nn.BatchNorm2d(32),
16            nn.MaxPool2d(kernel_size=2),
17            nn.Conv2d(32, 64, kernel_size=3, padding=1),
18            nn.ReLU(),
19            nn.BatchNorm2d(64),
20            nn.MaxPool2d(kernel_size=2),
21    )
22    self.classifier = nn.Sequential(
23            nn.Dropout(),
24            nn.Linear(64 * 7 * 7, 128),
25            nn.ReLU(),
26            nn.Dropout(),
27            nn.Linear(128, 256),
28            nn.ReLU(),
29            nn.Dropout(),
30            nn.Linear(256,128),
31            nn.ReLU(),
32            nn.Dropout(),
33            nn.Linear(128, 10),
34    )
35  def forward(self, x):
36    x = self.features(x)
37    x = torch.flatten(x, 1)
38    x = self.classifier(x)
39    return x

jbpb0

2023/02/07 10:58

Mixupが作成した画像を、ファイルに保存して確認してみたら、いかがでしょうか現実にはありえないような変な形のものが写ってる画像が大量に作成されてたりするかも

RyosuK.S

2023/02/07 13:58

うーん、普通ですね····· まさに、2枚の画像を足し合わせた様な感じです·····

ps_aux_grep

2023/02/07 14:07 編集

とりあえずデータは正規化されておらず，値域が0~255のままのようですね， testそのままでtrainがおかしいということは，Dropoutしすぎな気がします．デフォルトでDropout1つにつき50%のノードが不活性化するので，それを4回通すとすると単純計算で6.25%のノードの情報しか伝播してません，mixup無しでは1クラスしか指す必要がなかったものの，mixupありでは複数クラスを指す必要がある中で，6.25%の情報では不足していたのでしょうね．Dropout率を16%にすると4回通しても50%の情報を伝搬できるので，train精度が元に戻ると考えます．

jbpb0

2023/02/08 02:26

> 2枚の画像を足し合わせた様な感じ「靴」と「バッグ」が合成された、現実にはありえないような画像で学習しても、と思ったのですが、Mixupってそういうもので、それで効果があるのですね失礼しました

ps_aux_grep

2023/02/08 22:30

てかMixUpによる水増しに関してですが，結局のところ水増しになってませんね，オフラインDAによるMixUpは水増しになりませんので，オンラインDAの検討を願います．同じ条件で実験したところ，オンラインDAにするだけでtestデータへの精度よりもtrainデータの方が良い精度を示しました．