CIFER-10 をCNN で学習させたいが、層を増やしたときに収束しない

CIFER-10の画像分類を学習するCNNをpytorchで組んだのですが、うまくいきません。
調べてもよくわからなかったので、だめなところ指摘してもらえるとうれしいです。

やりたいこと

CIFER-10 の画像分類を浅い CNN に学習させたところ、accuracy が73%程度であった。
層を増やしてより高い accuracy を得たい。

浅い CNN のコード

モジュールのインポートとデータローダの作成

python
1
2import torch
3device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
4print(device)
5
6import torchvision
7import torchvision.transforms as transforms
8import torch.nn as nn
9import torch.nn.functional as F
10import torch.optim as optim
11% matplotlib inline
12import matplotlib.pyplot as plt
13import numpy as np
14
15
16batch_size = 256
17loss_interval = 50
18
19
20transform_aug = transforms.Compose(
21    [transforms.RandomHorizontalFlip(p=0.5),
22     transforms.RandomAffine(degrees=0.2, scale=(0.8,1.2)),
23     transforms.ToTensor(),
24     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
25     ])
26 
27trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
28                                        download=True, transform=transform_aug)
29trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
30                                          shuffle=True, num_workers=2)
31 
32
33testset = torchvision.datasets.CIFAR10(root='./data', train=False,
34                                       download=True, transform=transform_aug)
35testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
36                                         shuffle=True, num_workers=2)

モデルの定義と訓練、評価をする関数の定義

python
1
2#ネットワークの定義
3class Net(nn.Module):
4    def __init__(self):
5        super(Net, self).__init__()
6        self.conv1 = nn.Conv2d(3, 64, 3, padding = 1)
7        self.conv7 = nn.Conv2d(64, 64, 3, padding = 1)
8        self.conv2 = nn.Conv2d(64, 128, 3, padding = 1)
9        self.conv6 = nn.Conv2d(128, 128, 3, padding = 1)
10        self.conv3 = nn.Conv2d(128, 256, 3, padding = 1)
11        self.conv5 = nn.Conv2d(256, 256, 3, padding = 1)
12        self.conv4 = nn.Conv2d(256, 16, 3, padding = 1)
13        self.pool = nn.MaxPool2d(2)
14        self.avgpool = nn.AvgPool2d(2)
15        self.fc1 = nn.Linear(16 * 8 * 8, 120)
16        self.fc2 = nn.Linear(120, 84)
17        self.fc3 = nn.Linear(84, 10)
18        self.sm = nn.Softmax(1)
19
20        # dropoutの定義
21        self.dropout1 = nn.Dropout2d(p=0.2)
22        self.dropout2 = nn.Dropout2d(p=0.25)
23        self.dropout3 = nn.Dropout(p=0.3)
24        self.dropout4 = nn.Dropout(p=0.35)
25
26 
27    def forward(self, x):
28        x = F.relu(self.conv1(x))
29        x = F.relu(self.conv7(x))
30        x = F.relu(self.conv2(x))
31        # x = F.relu(self.conv6(x))
32        x = self.pool(x)
33        x = self.dropout1(x)
34        x = F.relu(self.conv3(x))
35        # x = F.relu(self.conv5(x))
36        x = F.relu(self.conv4(x))
37        x = self.pool(x)
38        x = self.dropout2(x)
39        x = x.view(-1, 16 * 8 * 8)
40        x = F.relu(self.fc1(x))
41        x = self.dropout3(x)
42        x = F.relu(self.fc2(x))
43        x = self.dropout4(x)
44        x = self.fc3(x)
45        return x
46
47
48#ネットワークを訓練する関数
49def train(net, criterion, optimizer, n_epoch = 15):
50    # Batch normalization 
51    net.train()
52    train_loss = []
53    test_loss = []
54    for epoch in range(n_epoch):
55        for i, data in enumerate(trainloader, 0):
56            inputs, labels = data[0].to(device), data[1].to(device)
57
58            optimizer.zero_grad()
59
60            outputs = net(inputs)
61            loss = criterion(outputs, labels)
62            loss.backward()
63            optimizer.step()
64
65            if i % loss_interval == (loss_interval - 1):
66                train_loss.append(loss.item())
67
68                with torch.no_grad(): 
69                    data = iter(testloader).next()
70                    inputs, labels = data[0].to(device), data[1].to(device)
71                    outputs = net(inputs)
72                    loss = criterion(outputs, labels)
73                    test_loss.append(loss.item())
74
75        print('epoch {}/{} finished'.format(epoch+1,n_epoch))
76            
77    print('Finished Training')
78    return train_loss, test_loss
79
80# 損失の変遷を表示する関数
81def show_loss(train_loss, test_loss):
82    plt.xlabel("iter")
83    plt.ylabel("loss")
84    x = [i*loss_interval for i in range(len(train_loss))]
85    plt.plot(x, train_loss, label='train_loss')
86    plt.plot(x, test_loss, label='test_loss')
87    plt.legend()
88    plt.show()
89
90# ネットワークの予測精度を計算する関数
91def check_accuracy(net):
92    net.eval()
93    ret = []
94    with torch.no_grad():
95        for loader, name in [[trainloader, 'train'], [testloader, 'test']]:
96            correct = 0
97            total = 0
98            for data in loader:
99                images, labels = data[0].to(device), data[1].to(device)
100                outputs = net(images)
101                _, predicted = torch.max(outputs.data, 1)
102                total += labels.size(0)
103                correct += (predicted == labels).sum().item()
104            ret.append(100 * correct / total)
105            print('Accuracy of the network on the {} images: {:.2f} %'.format(name, ret[-1]))
106
107    return ret

訓練、評価

python
1net = Net()
2net.to(device)
3
4criterion = nn.CrossEntropyLoss()
5optimizer_wd = optim.Adam(net.parameters(), lr=0.001, weight_decay=4e-3)
6
7train_loss, test_loss = train(net, criterion, optimizer_wd, n_epoch = 50)
8show_loss(train_loss, test_loss)
9
10trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
11                                        download=True, transform=transform)
12trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
13                                          shuffle=True, num_workers=2)
14testset = torchvision.datasets.CIFAR10(root='./data', train=False,
15                                       download=True, transform=transform)
16testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
17                                         shuffle=True, num_workers=2)
18
19acc = check_accuracy(net)

このコードが accuracy 73% 程度でした。

より層の多い CNN

モデルの定義を次のように変更しました。

python
1class Net(nn.Module):
2    def __init__(self):
3        super(Net, self).__init__()
4        self.conv1 = nn.Conv2d(3, 64, 3, padding = 1)
5        self.conv7 = nn.Conv2d(64, 64, 3, padding = 1)
6        self.conv2 = nn.Conv2d(64, 128, 3, padding = 1)
7        self.conv6 = nn.Conv2d(128, 128, 3, padding = 1)
8        self.conv3 = nn.Conv2d(128, 256, 3, padding = 1)
9        self.conv5 = nn.Conv2d(256, 256, 3, padding = 1)
10        self.conv4 = nn.Conv2d(256, 16, 3, padding = 1)
11        self.pool = nn.MaxPool2d(2)
12        self.avgpool = nn.AvgPool2d(2)
13        self.fc1 = nn.Linear(16 * 4 * 4, 120)
14        self.fc2 = nn.Linear(120, 84)
15        self.fc3 = nn.Linear(84, 10)
16        self.sm = nn.Softmax(1)
17
18        # dropoutの定義
19        self.dropout1 = nn.Dropout2d(p=0.2)
20        self.dropout2 = nn.Dropout2d(p=0.25)
21        self.dropout3 = nn.Dropout(p=0.3)
22        self.dropout4 = nn.Dropout(p=0.35)
23
24    def forward(self, x):
25        x = F.relu(self.conv1(x))
26        x = F.relu(self.conv7(x))
27        x = self.dropout2(x)
28        x = self.pool(x)
29
30        x = F.relu(self.conv2(x))
31        x = F.relu(self.conv6(x))
32        x = self.dropout2(x)
33        x = self.pool(x)
34
35        x = F.relu(self.conv3(x))
36        x = F.relu(self.conv5(x))
37        x = F.relu(self.conv4(x))
38        x = self.dropout2(x)
39        x = self.avgpool(x)
40        x = x.view(-1, 16 * 4 * 4)
41
42        x = F.relu(self.fc1(x))
43        x = self.dropout3(x)
44        x = F.relu(self.fc2(x))
45        x = self.fc3(x)
46        x = self.sm(x)
47
48        return x

このモデルの summery は

---
        Layer (type)               Output Shape         Param #
===
            Conv2d-1           [-1, 64, 32, 32]           1,792
            Conv2d-2           [-1, 64, 32, 32]          36,928
         Dropout2d-3           [-1, 64, 32, 32]               0
         MaxPool2d-4           [-1, 64, 16, 16]               0
            Conv2d-5          [-1, 128, 16, 16]          73,856
            Conv2d-6          [-1, 128, 16, 16]         147,584
         Dropout2d-7          [-1, 128, 16, 16]               0
         MaxPool2d-8            [-1, 128, 8, 8]               0
            Conv2d-9            [-1, 256, 8, 8]         295,168
           Conv2d-10            [-1, 256, 8, 8]         590,080
           Conv2d-11             [-1, 16, 8, 8]          36,880
        Dropout2d-12             [-1, 16, 8, 8]               0
        AvgPool2d-13             [-1, 16, 4, 4]               0
           Linear-14                  [-1, 120]          30,840
          Dropout-15                  [-1, 120]               0
           Linear-16                   [-1, 84]          10,164
           Linear-17                   [-1, 10]             850
          Softmax-18                   [-1, 10]               0
====
~ (文字数制限のため省略）

このモデルを訓練すると、loss が収束せず、10epoch ほど回してもaccuracyは10%(10種類の画像だから何も学習していない）でした。
ちなみにこのモデルは　ここのベンチマークモデル(Keras)↓を参考に作っています。

def create_bench_model():
    inputs = Input(shape = (32,32,3))
    x = Conv2D(64,(3,3),padding = "SAME",activation= "relu")(inputs)
    x = Conv2D(64,(3,3),padding = "SAME",activation= "relu")(x)
    x = Dropout(0.25)(x)
    x = MaxPooling2D()(x)

    x = Conv2D(128,(3,3),padding = "SAME",activation= "relu")(x)
    x = Conv2D(128,(3,3),padding = "SAME",activation= "relu")(x)
    x = Dropout(0.25)(x)
    x = MaxPooling2D()(x)

    x = Conv2D(256,(3,3),padding = "SAME",activation= "relu")(x)
    x = Conv2D(256,(3,3),padding = "SAME",activation= "relu")(x)
    x = GlobalAveragePooling2D()(x)

    x = Dense(1024,activation = "relu")(x)
    x = Dropout(0.25)(x)
    y = Dense(10,activation = "softmax")(x)

    return Model(input = inputs, output = y)