ディープラーニングでの結果と損失率が正しく出力されません

ゼロから作るDeepLearning　を読んで、C＋＋とCUDAで作成しています。
誤差逆伝播の実装において、異なる画像データを入力しても、softmax関数の出力結果が常に同じ値になってしまいます。
また損失関数の値も、最初は小さくなっていくのですが、
２～３の間で下がりません。もしくは、乱数によっては、一定まで下がったら
急に損失率があがり、infを返します。
その時の出力結果は、10個のうち一つを「１」と返してきます。

添付画像は、損失関数の値、softmax関数の出力結果、および正解データです。

Affineレイヤー
Reluレイヤー
Softmaxレイヤー
の実装が間違っていると思い、
手計算できる行列で、電卓を使いながら計算して追いかけましたが、
出力結果は正しかったです。

重み・バイアスの更新時になにか注意することなどがあるのでしょうか？
原因を教えてください。

C++
// Matrix class.
// The four arithmetic operations on matrices are computed with CUDA.
// (The author reports having checked the arithmetic results with a calculator.)
#pragma once

namespace GPU {

	// Matrix whose storage is a GPU-side array; the conversion operator below
	// exposes that device pointer so an instance can be passed straight to a kernel.
	class matrix final {
		double* m{ nullptr };	// GPU-side array

	public:
		// Returns the GPU array pointer held by this matrix.
		operator double* () const { return m; }

		// (remaining members were omitted in the original post)
	};

}	// namespace GPU

layerを作る為のCUDA定義・宣言

CUDA
///////////////////////////////
// ReLU: non-negative inputs pass through unchanged, negative inputs become 0
///////////////////////////////
/*
         { x   (x >  0)
  h(x) = {
         { 0   (x <= 0)
*/
// m: input, out: output, size: number of elements in both arrays.
__global__ void ReLU(double* m, double* out, size_t size);

// Backward pass of ReLU: sets out[i] to 0 wherever m[i] <= 0 and leaves the
// other entries of out untouched.
__global__ void backReLU(double* m, double* out, size_t size);

//////////////////////////////////////////////
// Output function (the class with the highest output is judged to be the answer)
//////////////////////////////////////////////
// Softmax
/*
                  exp(a[k] + C)
  y[k] = ----------------------------
          sum_{i=1..n} exp(a[i] + C)
*/
// m: input (row x col), out: output (row x col); softmax is taken along each row.
__global__ void SOFTMAX(double* m, size_t row, size_t col, double* out);


////////////////
// Sums each column of a matrix
////////////////
// X: input matrix, y: one sum per column, row_size: number of rows,
// col_size: number of columns. Parameter names match the definition.
__global__ void Matrix_col_Sum(double* X, double* y, int row_size, int col_size);

CUDA
1
2//定義
3
4/////////////
5//活性化関数
6/////////////
7
// ReLU: writes max(m[i], 0)-style output, i.e. out[i] = 0 when m[i] < 0 and
// out[i] = m[i] otherwise.
//
// m    : input array
// out  : output array (same length as m)
// size : number of elements
//
// Any 1D launch configuration is valid: each thread starts at its global index
// and advances by the total thread count. The index is size_t so that it can
// never wrap before reaching `size`.
__global__ void ReLU(double* m, double* out, size_t size) {

	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < size; i += stride) {
		const double v = m[i];
		out[i] = (v < 0) ? 0.0 : v;
	}
}
27
// Backward pass of ReLU, applied in place on `out`.
//
// m    : values used as the mask
// out  : gradient array; out[i] is set to 0 where m[i] <= 0, otherwise left as is
// size : number of elements
//
// Any 1D launch configuration is valid: each thread starts at its global index
// and advances by the total thread count. The index is size_t so that it can
// never wrap before reaching `size`.
__global__ void backReLU(double* m, double* out, size_t size) {

	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < size; i += stride) {
		if (m[i] <= 0) {
			out[i] = 0;
		}
	}
}
42
43
44///////////////////////
45
// Softmax along each row of a row x col matrix.
//
// m   : input; element (r, c) is read from m[row * c + r]
// row : number of rows (one softmax per row)
// col : number of columns (entries per softmax)
// out : output, indexed the same way as m
//
// The row maximum is subtracted before exp() so large inputs do not overflow.
// Originally intended for <<<1, row>>>, but any 1D launch works: each thread
// starts at its global index and advances by the total thread count.
//
// No device-side heap allocation is used. The exponentials are staged in `out`
// and normalised in place, so there is no allocation that could fail and each
// exp() is evaluated once.
__global__ void SOFTMAX(double* m, size_t row, size_t col, double* out) {

	// Nothing to normalise, and reading m[r] below would be out of bounds.
	if (col == 0) {
		return;
	}

	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	for (size_t r = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; r < row; r += stride) {

		// Largest element of this row
		double row_max = m[r];
		for (size_t c = 1; c < col; ++c) {
			const double v = m[row * c + r];
			if (row_max < v) {
				row_max = v;
			}
		}

		// exp(a[k] - max), accumulated in column order
		double sum_exp = 0.0;
		for (size_t c = 0; c < col; ++c) {
			const double e = exp(m[row * c + r] - row_max);
			out[row * c + r] = e;
			sum_exp += e;
		}

		// Normalise so the row sums to 1
		for (size_t c = 0; c < col; ++c) {
			out[row * c + r] /= sum_exp;
		}
	}
}
87
88
89//クロスエントロピー誤差
90
91double CrossEntropyError(GPU::matrix& y, GPU::matrix& t) {
92
93	double sum = 0.0;
94
95	
96	auto pY = y.read_vec2();
97	auto pT = t.read_vec2();
98
99	for (int i = 0; i < y.col(); i++) {
100		for (int j = 0; j < y.row(); j++) {
101			if (pT[j][i] != 0) {
102				sum += pT[j][i] * std::log(pY[j][i]);
103			}
104		}
105		
106	}
107
108
109	return -sum / y.row();
110}
111
112
/////////////////
// Sums each column of a matrix
/////////////////
//
// X        : input; element (r, c) is read from X[c * row_size + r]
// y        : output, y[c] receives the sum of column c
// row_size : number of rows
// col_size : number of columns
//
// Originally intended for <<<1, col_size>>>, but any 1D launch works: each
// thread starts at its global index and advances by the total thread count.
// The column offset and the accumulator are recomputed for every column, and
// nothing is written for indices >= col_size.
__global__ void Matrix_col_Sum(double* X, double* y, int row_size, int col_size) {

	if (col_size <= 0) {
		return;
	}

	const size_t cols = static_cast<size_t>(col_size);
	const size_t rows = row_size > 0 ? static_cast<size_t>(row_size) : 0;
	const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

	for (size_t c = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; c < cols; c += stride) {

		const size_t base = c * rows;

		// Accumulate locally and store once
		double acc = 0.0;
		for (size_t r = 0; r < rows; ++r) {
			acc += X[base + r];
		}
		y[c] = acc;
	}
}
135

各レイヤー

C++
1class Affine {
2		GPU::matrix* w, * b;
3		GPU::matrix x;
4		GPU::matrix dw, db;
5	public:
6		Affine() = delete;
7		Affine(GPU::matrix* w, GPU::matrix* b) :w(w), b(b) {};
8
9		GPU::matrix forward(GPU::matrix* x) {
10			this->x = *x;
11			return (this->x * (*w)) + (*b);
12		}
13
14		GPU::matrix backward(GPU::matrix* dout) {
15			auto Wt = *this->w;
16			Wt.permutation();
17			auto dx = (*dout) * Wt;
18
19			auto Xt = this->x;
20			Xt.permutation();
21			this->dw = Xt * (*dout);
22
23			db.resize_write(1, dout->col());
24			//列で加算する
25			Matrix_col_Sum << <1, dout->col() >> >
26				(*dout, this->db, dout->row(), dout->col());
27			cudaThreadSynchronize();
28			return dx;
29		}
30
31		GPU::matrix get_dw() { return dw; };
32		GPU::matrix get_db() { return db; };
33
34	};

C++
1class Relu {
2		GPU::matrix data;
3	public:
4		Relu() {}
5
6		GPU::matrix forward(GPU::matrix* x) {
7			data = *x;
8
9			const int n2 = x->row() * x->col();
10
11			ReLU
12				<< < (n2 + g_inf.prop->maxThreadsDim[0] - 1) / g_inf.prop->maxThreadsDim[0], g_inf.prop->maxThreadsDim[0] >> >
13				(*x, data, n2);
14			cudaThreadSynchronize();
15			return data;
16		}
17
18		GPU::matrix backward(GPU::matrix* dout) {
19
20			GPU::matrix dx = *dout;
21
22			const int n2 = dx.row() * dx.col();
23			backReLU
24				<< < (n2 + g_inf.prop->maxThreadsDim[0] - 1) / g_inf.prop->maxThreadsDim[0], g_inf.prop->maxThreadsDim[0] >> >
25				(data, dx, n2);
26			cudaThreadSynchronize();
27			return dx;
28		}
29
30	};

C++
1class SoftmaxWithLoss {
2
3		double loss{ NULL };
4		
5		GPU::matrix y, t;
6	public:
7
8
9		double forward(GPU::matrix* x, GPU::matrix* t) {
10
11
12			y.resize_write(x->row(), x->col(), 0);
13
14			SOFTMAX << < 1, x->row() >> >
15				(*x, x->row(), x->col(), y);
16			cudaThreadSynchronize(); 
17
18			this->t = *t;
19
20			this->loss = CrossEntropyError(y, *t);
21
22			return this->loss;
23		}
24
25		GPU::matrix Out(GPU::matrix* x, GPU::matrix* t) {
26
27			y.resize_write(x->row(), x->col(), 0);
28
29			SOFTMAX << < 1, x->row() >> >
30				(*x, x->row(), x->col(), y);
31			cudaThreadSynchronize();
32
33			
34			return y;
35		}
36
37		GPU::matrix backward(double dout = 1) {
38			double batch_size = t.row();
39			return (this->y - this->t) / batch_size;
40
41		}
42
43	};

main関数での処理

「画像データ読み込み」関数を作って、data, label, testdata, testlabel に読み込んでいます。
「g_Log->・・・」はログを取る為にテキストファイルに書き込んでいます。

C++
1
2////////////////
3//FILE読込
4////////////////
5		//省略 
6                //vector<vector>で　data　、　label　に入っている
7
8		
9		constexpr	size_t  iters_num = 10000;				//繰り返し回数
10		constexpr int batch_size = 25;
11		constexpr double learning_rate = 0.01;				//学習率
12		const double input_size = data[0].size(), hidden_size = 50, output_size = label[0].size();
13
14
15	//損失関数の出力を入れる
16		std::list<double> lossList;
17
18
19//行列の作成
20//W：重みは正規分布に従って乱数生成
21//b：０で初期化
22		GPU::matrix AffineVectorW1(W1, input_size, hidden_size);
23		GPU::matrix AffineVectorB1(1, hidden_size, 0);
24		GPU::matrix AffineVectorW2(W2, hidden_size, output_size);
25		GPU::matrix AffineVectorB2(1, output_size, 0);
26		AffineVectorW1 *= learning_rate;
27		AffineVectorW2 *= learning_rate;
28
29//レイヤーの作成
30		Test2Network::Affine ClassAffine1(&AffineVectorW1, &AffineVectorB1);
31		Test2Network::Relu ClassRelu;
32		Test2Network::Affine ClassAffine2(&AffineVectorW2, &AffineVectorB2);
33		Test2Network::SoftmaxWithLoss ClassSoftmax;
34//損失が0.8未満になるまでループ
35		for (int i=0;;i++) {
36
37			//ミニバッチの乱数取得
38			std::vector<int> rund_batch_mask, rund_testBatch_mask;				//乱数を入れる
39
40
41		//乱数生成器
42			std::random_device rnd;								// 非決定的な乱数生成器を生成
43			std::mt19937 mt(rnd());								//  メルセンヌ・ツイスタの32ビット版、引数は初期シード値
44			std::uniform_int_distribution<> rand100(0, data.size() - 1);	// [0, 60000] 範囲の一様乱数
45			std::uniform_int_distribution<> rand_testData(0, testdata.size() - 1);	// [0, testデータサイズ] 範囲の一様乱数
46
47
48			//乱数の取得
49			for (int j = 0; j < batch_size; j++) {
50				rund_batch_mask.push_back(rand100(mt));
51				rund_testBatch_mask.push_back(rand_testData(mt));
52			}
53
54			std::vector<std::vector<double>> X_batch(batch_size, std::vector<double>(input_size, 0));
55			std::vector<std::vector<double>> T_batch(batch_size, std::vector<double>(output_size, 0));
56			
57
58
59			//無作為にデータを取得
60			for (int j = 0; j < batch_size; j++) {
61					X_batch[j] = data[rund_batch_mask[j]];
62　　　　　　　}
63
64			//正解データを無作為に取得
65			for (int j = 0; j < batch_size; j++) {
66				T_batch[j] = label[rund_batch_mask[j]];
67			}
68
69
70			//学習データの取得
71			GPU::matrix x_batch(X_batch);
72
73			//正解データの保存
74			GPU::matrix t_batch(T_batch);
75
76			
77
78//ここからディープラーニング
79
80			//std::wcout << L" X" << std::endl;
81			//x_batch.show();
82			//std::wcout << L" W1" << std::endl;
83			//AffineVectorW1.show();
84			auto Aff1 = ClassAffine1.forward(&x_batch);
85
86
87			//std::wcout << L" Aff1" << std::endl;
88			//Aff1.show();
89			auto Relu1 = ClassRelu.forward(&Aff1);
90
91			//std::wcout << L" Relu1" << std::endl;
92			//Relu1.show();
93
94
95			//std::wcout << L" W2" << std::endl;
96			//AffineVectorW2.show();
97			auto Aff2 = ClassAffine2.forward(&Relu1);
98
99			//Aff2.show();
100			double loss = ClassSoftmax.forward(&Aff2, &t_batch);
101
102			lossList.push_back(loss);
103
104			//std::wcout << L"バックワード" << std::endl;
105
106			auto last = ClassSoftmax.backward(1);
107			//last.show();
108
109			auto last2 = ClassAffine2.backward(&last);
110			//last2.show();
111
112			auto last3 = ClassRelu.backward(&last2);
113			//last3.show();
114
115			auto last4 = ClassAffine1.backward(&last3);
116			//last4.show();
117
118　　　　　　　//更新
119			AffineVectorW1 -= ClassAffine1.get_dw() * learning_rate;
120			AffineVectorB1 -= ClassAffine1.get_db() * learning_rate;
121			AffineVectorW2 -= ClassAffine2.get_dw() * learning_rate;
122			AffineVectorB2 -= ClassAffine2.get_db() * learning_rate;
123
124			if ((i % 4000) == 0) {
125				std::wcout << L"損失 : " << loss << std::endl;
126				std::wcout << L"予測" << std::endl;
127				ClassSoftmax.Out(&Aff2, &t_batch).show();
128				std::wcout << L"正解" << std::endl;
129				t_batch.show();
130			
131				std::wcout << L"ーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーーー" << std::endl;
132			}
133
134			if (loss < 0.8) {
135				break;
136			}
137
138		}
139
140