2層ニューラルネットワーク(誤差逆伝播法)

学習がうまく行われず困っております．
活性化関数にReLU関数を用いた，
2層フィードフォワード型ニューラルネットワークです．
中間層のニューロン数はとりあえず100としています．
誤差の推移をプロットしてみると，初期は誤差が低下しているのですが，
かなり早期に誤差が収束し，パラメータの更新が行われなくなるといった状態です．

1.ニューラルネット本体

python
1# -*- coding: utf-8 -*-
2
3import numpy as np
4import func  as F
5
# Two-layer neural network
class TwoLayerNet:
    """Two-layer feed-forward network trained by backpropagation.

    The hidden layer uses a ReLU activation and the output layer is linear.
    ``backprop`` uses ``yo - tdata`` as the output error, i.e. the gradient
    of a squared-error loss ``0.5 * (yo - tdata) ** 2`` summed over the batch.

    Parameters are stored as:
        W1: (inodes, hnodes) input-to-hidden weights
        b1: (1, hnodes)      hidden biases
        W2: (hnodes, onodes) hidden-to-output weights
        b2: (onodes,)        output biases
    """

    def __init__(self, inodes, hnodes, onodes, lr):
        """Create a network with the given layer sizes and learning rate.

        Args:
            inodes: number of input nodes.
            hnodes: number of hidden nodes.
            onodes: number of output nodes.
            lr: learning rate applied to every parameter update.
        """
        # number of nodes
        self.inodes = inodes
        self.hnodes = hnodes
        self.onodes = onodes

        # learning rate
        self.lr = lr

        # Weights are drawn from N(0, 0.1^2); biases start at zero.
        self.W1 = 0.1 * np.random.randn(self.inodes, self.hnodes)
        self.b1 = np.zeros((1, self.hnodes))
        self.W2 = 0.1 * np.random.randn(self.hnodes, self.onodes)
        self.b2 = np.zeros(self.onodes)

    def _forward(self, idata):
        """Run the forward pass on a 2-D batch.

        Returns:
            A tuple ``(mask, yh, yo)``: the boolean mask of hidden units whose
            pre-activation is <= 0, the hidden activations, and the output.
        """
        # input =>> hidden (ReLU: non-positive pre-activations become 0)
        xh = np.dot(idata, self.W1) + self.b1
        mask = (xh <= 0.0)
        yh = np.where(mask, 0.0, xh)

        # hidden =>> output (linear)
        xo = np.dot(yh, self.W2) + self.b2
        yo = np.array(xo, ndmin=2)
        return mask, yh, yo

    def backprop(self, idata, tdata):
        """Perform one gradient-descent step on a batch and update in place.

        Args:
            idata: inputs, array-like of shape (batch, inodes).
            tdata: targets, array-like of shape (batch, onodes).

        Returns:
            The element-wise squared error ``(yo - tdata) ** 2`` computed
            before the update, shape (batch, onodes).
        """
        idata = np.array(idata, ndmin=2)
        tdata = np.array(tdata, ndmin=2)

        mask, yh, yo = self._forward(idata)

        # output error
        eo = yo - tdata

        # output =>> hidden
        # The error must be propagated through W2 *before* W2 is updated.
        delta_yh = np.dot(eo, self.W2.T)
        self.W2 -= self.lr * np.dot(yh.T, eo)
        # Sum over the batch axis only, so each output node gets its own
        # bias gradient (a full sum would mix the output nodes together).
        self.b2 -= self.lr * np.sum(eo, axis=0)

        # hidden =>> input (ReLU passes no gradient where it was inactive)
        delta_xh = np.where(mask, 0.0, delta_yh)
        self.W1 -= self.lr * np.dot(idata.T, delta_xh)
        self.b1 -= self.lr * np.sum(delta_xh, axis=0)

        return eo**2

    def feedforward(self, idata):
        """Return the network output for ``idata`` as a 2-D array."""
        idata = np.array(idata, ndmin=2)
        _, _, yo = self._forward(idata)
        return yo

    def cal_loss(self, o, t):
        """Return the sum of squared errors divided by the number of rows of ``o``."""
        o = np.array(o, ndmin=2)
        t = np.array(t, ndmin=2)
        size = o.shape[0]
        return np.sum((o - t)**2)/size

2.学習を行うメインプログラム

python
1# -*- coding: utf-8 -*-
2
3import time
4import numpy as np
5import matplotlib.pyplot as plt
6from mpl_toolkits.mplot3d import Axes3D
7from TLnet import TwoLayerNet
8
9
def normal(data):
    """Min-max scale ``data`` to the range [0, 1].

    Args:
        data: array-like of numbers (must be non-empty).

    Returns:
        A float array of the same shape with the minimum mapped to 0 and the
        maximum mapped to 1. If every element is equal, an array of zeros is
        returned instead of dividing by zero.
    """
    data = np.asarray(data, dtype=float)
    min_val = np.min(data)
    span = np.max(data) - min_val
    if span == 0:
        # Constant input: avoid 0/0 producing NaN.
        return np.zeros_like(data)
    return (data - min_val) / span
15
if __name__ == '__main__':
    # Local import: only needed here to create the output directory.
    import os

    # start timing
    time1 = time.time()

    # load the data (column 0: input x, column 1: target t)
    train = np.loadtxt('2D_nonlinear_train.csv', delimiter=',')
    test  = np.loadtxt('2D_nonlinear_test.csv', delimiter=',')

    # split into inputs and targets
    x_train = train[:, 0].copy()
    t_train = train[:, 1].copy()
    x_test  = test[:, [0]].copy()   # kept 2-D: (n_test, 1)
    t_test  = test[:, [1]].copy()

    # Scale the inputs to [-1, 1] using the *training* range for both sets,
    # so train and test inputs go through the same transformation.
    # A symmetric range matters here: TwoLayerNet starts with zero hidden
    # biases, so with inputs in [0, 1] every hidden unit whose input weight
    # is negative is inactive for all samples and never receives a gradient.
    x_min = np.min(x_train)
    x_span = np.max(x_train) - x_min
    if x_span == 0:
        x_span = 1.0  # constant input: avoid division by zero
    x_train = 2.0 * (x_train - x_min) / x_span - 1.0
    x_test  = 2.0 * (x_test - x_min) / x_span - 1.0

    # settings
    inodes = 1
    hnodes = 100
    onodes = 1
    learning_rate = 0.001
    itr_num = 10000
    train_size = x_train.shape[0]
    batch_size = 10
    train_loss_list = []
    itr_list = []

    i_data = np.zeros((batch_size, inodes))
    # Targets have one column per output node (not per input node).
    t_data = np.zeros((batch_size, onodes))

    # initialise the neural network
    nn = TwoLayerNet(inodes, hnodes, onodes, learning_rate)

    # history of the parameters over the iterations
    data1 = np.zeros((itr_num, hnodes))   # W1[0, :] per iteration
    data2 = np.zeros((hnodes, itr_num))   # W2[:, 0] per iteration

    # training loop
    for itr in range(itr_num):

        itr_list.append(itr + 1)

        # pick a random mini-batch (sampled with replacement)
        batch_mask = np.random.choice(train_size, batch_size)
        i_data[:, 0] = x_train[batch_mask]
        t_data[:, 0] = t_train[batch_mask]

        # backpropagation step; record the mean squared error of the batch
        loss = nn.backprop(i_data, t_data)
        train_loss_list.append(np.sum(loss)/batch_size)

        data1[itr, :] = nn.W1[0, :]
        data2[:, itr] = nn.W2[:, 0]

    # training loss curve
    plt.plot(itr_list, train_loss_list)
    plt.show()

    # trajectories of the first few hidden units' weights
    n_shown = min(10, hnodes)

    for k in range(n_shown):
        plt.plot(itr_list, data1[:, k])
    plt.show()

    for k in range(n_shown):
        plt.plot(itr_list, data2[k, :])
    plt.show()
    print('start', train_loss_list[0], 'fin', train_loss_list[-1])

    # save the parameters (create the directory first so savetxt cannot
    # fail on a missing path)
    os.makedirs('parameter', exist_ok=True)
    np.savetxt('parameter/W1.csv', nn.W1, delimiter=',', fmt='%.5e')
    np.savetxt('parameter/W2.csv', nn.W2, delimiter=',', fmt='%.5e')

    # evaluate on the test data
    o_last = nn.feedforward(x_test)
    fin_error = nn.cal_loss(o_last, t_test)

    # plot test targets (red) against the network output
    plt.scatter(x_test[:, 0], t_test[:, 0], c='r')
    plt.scatter(x_test[:, 0], o_last[:, 0])

    plt.show()

    # report the final error and elapsed time
    print('final error : ', fin_error)
    total_time = time.time() - time1
    print('Analysis Time : ', total_time)

3.教師データ

2D_nonlinear_train.csv

csv
1-2.42918,-0.65366
20.46020,0.44412
3-0.51264,-0.49048
41.68039,0.99400
5-1.87141,-0.95515
60.77454,0.69938
70.50775,0.48622
8-1.68369,-0.99363
9-0.74513,-0.67807
100.39743,0.38705
11-1.20763,-0.93478
121.00598,0.84469
132.01414,0.90332
14-2.81780,-0.31816
152.11814,0.85391
161.54556,0.99968
17-1.80319,-0.97312
180.18734,0.18624
19-2.30033,-0.74548
20-1.89301,-0.94854
212.88370,0.25504
220.84278,0.74649
23-1.93772,-0.93343
241.50528,0.99785
252.37553,0.69330
26-2.59081,-0.52335
27-2.54117,-0.56499
282.30274,0.74387
292.54901,0.55850
30-2.95515,-0.18536
311.07075,0.87756
32-1.34700,-0.97506
33-2.56922,-0.54163
341.84158,0.96356
351.48729,0.99652
36-1.76051,-0.98206
37-1.46221,-0.99411
38-0.68339,-0.63142
39-1.66910,-0.99517
400.72538,0.66342
41-2.85516,-0.28253
42-1.28191,-0.95856
43-1.76232,-0.98172
441.94551,0.93061
452.88071,0.25793
46-2.91680,-0.22290
472.30751,0.74068
480.18115,0.18016
49-1.54516,-0.99967
50-1.74340,-0.98514
51-1.37253,-0.98041
521.13253,0.90549
532.95849,0.18208
540.39280,0.38277
55-1.37695,-0.98127
56-0.52808,-0.50387
572.24413,0.78175
58-2.16614,-0.82795
59-2.30540,-0.74210
60-0.38153,-0.37234
61-2.14907,-0.83741
622.62027,0.49803
630.05721,0.05718
642.69852,0.42872
65-0.74841,-0.68047
66-2.31861,-0.73318
67-0.64064,-0.59770
68-1.41704,-0.98820
69-2.12087,-0.85249
70-1.32248,-0.96933
710.55867,0.53005
721.45299,0.99307
73-0.47135,-0.45409
74-2.95849,-0.18208
75-1.16196,-0.91758
761.62244,0.99867
770.16333,0.16260
78-2.75232,-0.37952
79-1.46417,-0.99432
80-0.43863,-0.42470
81-1.95757,-0.92613
821.65249,0.99667
831.30444,0.96474
841.69902,0.99179
850.27679,0.27327
86-1.65713,-0.99628
870.79812,0.71604
88-0.85613,-0.75531
890.11371,0.11347
901.30561,0.96504
912.71132,0.41712
920.14721,0.14668
931.19344,0.92964
94-2.50737,-0.59255
95-1.22375,-0.94038
96-2.14447,-0.83991
97-1.47902,-0.99579
980.95897,0.81860
990.92045,0.79587
1001.11144,0.89634

4.テストデータ

2D_nonlinear_test.csv

csv
1-1.45583,-0.99340
22.30651,0.74135
3-2.46646,-0.62500
4-0.22035,-0.21857
5-1.75282,-0.98348
61.68121,0.99391
7-0.40438,-0.39345
8-2.86926,-0.26897
9-1.24715,-0.94808
100.99451,0.83849
11-0.34849,-0.34148
12-1.03496,-0.85984
13-2.29462,-0.74928
14-2.35016,-0.71136
153.05246,0.08902
16-0.34239,-0.33574
17-0.21854,-0.21681
18-1.99980,-0.90938
192.82342,0.31283
20-2.55879,-0.55036
210.18534,0.18428
220.84254,0.74634
231.32659,0.97033
242.25621,0.77416
252.03810,0.89279
26-2.42171,-0.65929
271.51424,0.99840
281.09567,0.88924
29-0.87904,-0.77013
302.31821,0.73345
31-0.02740,-0.02739
32-1.49032,-0.99676
330.42080,0.40850
342.15972,0.83154
352.18731,0.81590
36-0.56826,-0.53816
37-1.90201,-0.94565
38-2.82840,-0.30810
39-0.10326,-0.10307
40-1.36469,-0.97884
411.08535,0.88447
421.64616,0.99716
430.85374,0.75374
44-0.24523,-0.24278
45-2.72365,-0.40588
46-0.26408,-0.26102
47-2.76358,-0.36908
482.19125,0.81362
49-2.85404,-0.28361
50-1.15384,-0.91432
512.46126,0.62905
520.21929,0.21754
53-2.53900,-0.56678
54-0.87053,-0.76467
551.18904,0.92801
561.13625,0.90706
57-2.79509,-0.33961
58-2.41076,-0.66749
591.76445,0.98131
60-1.64047,-0.99757
610.13556,0.13514
62-2.22426,-0.79398
630.32671,0.32093
640.68426,0.63210
651.96285,0.92413
66-0.70908,-0.65113
671.87555,0.95392
68-2.10325,-0.86156
69-1.77567,-0.97909
70-0.80466,-0.72060
711.98088,0.91709
72-0.90742,-0.78792
73-2.82837,-0.30813
740.02647,0.02647
750.43010,0.41696
762.77286,0.36043
77-1.68917,-0.99300
782.80860,0.32687
79-2.41900,-0.66134
80-0.11983,-0.11954
812.92018,0.21961
821.79221,0.97559
832.56534,0.54488
84-1.66536,-0.99553
850.16021,0.15952
862.93057,0.20946
870.26122,0.25826
881.41935,0.98855
892.19007,0.81430
901.87061,0.95539
912.71052,0.41785
92-2.38674,-0.68518
931.04716,0.86601
94-1.09301,-0.88801
950.73699,0.67206
962.26408,0.76915
970.77800,0.70186
98-2.94704,-0.19332
992.35517,0.70783
100-0.68826,-0.63519
101

** 誤差の収束状況 **

** 学習後テストデータとニューラルネットワークの出力 **

赤：テストデータ
青：ニューラルネットワークの出力
になります

meg_

2020/06/21 04:12

「学習がうまく行われず」とは具体的にはどういうことですか？正解率が低いってことですか？

takamy

2020/06/21 04:17

初期は誤差が低下していることを確認しているのですが，イテレーションがある程度進むと重みパラメータが収束する挙動を示します．また，学習終了後にテストデータと比較しグラフをプロットしても出力が教師データに全く近づいていないといった状況です．

takamy

2020/06/21 04:19

誤差逆伝播法の実装に何か誤りがあるとは思っているのですが，一か月以上悩みましたが，解決に至らず藁にも縋る思いで質問させていただきました．

meg_

2020/06/21 04:25

上記の説明とグラフを質問に追記すると良いでしょう。回答が付きやすくなると思います。ちなみに質問のコードは一から質問者さんが作成されたのでしょうか？もし参考にされたもの（書籍等）があるならそれも追記すると回答者の参考になるかと思います。

meg_

2020/06/21 04:30

未検証ですが、learning rate（learning_rate = 0.001）をもっと小さくしてはどうでしょうか？

takamy

2020/06/21 04:34

学習率はかなり落として解析の検証を行いましたが，効果がありませんでした...

kzm4269

2020/06/28 16:29

sinカーブは非線形性が強いので、自作NNを試験するときの第一段階には向いていないと思います。まず一次関数、二次関数あたりの単純な関数を近似できることを確認してみてはどうでしょうか？これらを近似できるのであればバックプロップのコードに大きなバグはないと思います。そのうえでまだTensorFlowやPyTorchの実装と比べて自作版の性能が劣る場合は、丸め誤差対策などの工夫の有無に起因していると思われます。

takamy

2020/07/02 13:15

ありがとうございます．検証してみたいと思います．