ニューラルネットワークによるクラス分類を行いました。評価方法はホールドアウト検証、交差検証、層化k分割交差検証の3種類で行いました。
7つのデータセットについてパラメータを交差検証を用いたグリッドサーチにより求めました。ハイパーパラメータはhidden_layer_sizesとmax_iterの2種類用いました。ニューラルネットワークのハイパーパラメータの条件は、hidden_layer_sizesが(100,100)、(10,10)、(100,)、(10,)の4種類とmax_iterが10000,1000,100,10の4種類です。以下が交差検証を用いたグリッドサーチのコードになります。
Python
# Grid search over MLPClassifier hyper-parameters using 5-fold CV.
# (Reconstructed from a paste that was collapsed onto a single line;
# the duplicated '35' in the drop list has been removed.)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Load the Shift-JIS encoded CSV, skipping the header row and
# assigning explicit column names.
df = pd.read_csv('sasa_2019.csv',
                 encoding="shift-jis",
                 skiprows=1,
                 names=['ID','menseki','totiriyou','0722R','0604R','0611R','0525A','0606A','0513A','R_avg','A_avg','rorn','0404N','0504N','0511N','00524N','1001N','1028N','1117N','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','0409N','0416N','0516N','0625N','1102N','1115N','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'])

# Drop the unused numbered columns, then remove rows with any NaN.
sasa = df.drop(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'], axis=1)
sasa = sasa.dropna(how='any')

# Feature matrix and target label.
X = sasa.loc[:, ["A_avg",'0404N','0504N','0511N','00524N','1001N','1028N','1117N','0409N','0416N','0516N','0625N','1102N','1115N']]
y = sasa['rorn']

# Hold-out split: 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize features; the scaler is fitted on the training split only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Hyper-parameter grid: 4 layer layouts x 4 iteration budgets.
param_grid = {'hidden_layer_sizes': [(100,100),(10,10),(100,),(10,)],
              'max_iter': [10000,1000,100,10]}

grid_search = GridSearchCV(MLPClassifier(random_state=0), param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("P.260~")
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("Best estimator:\n{}".format(grid_search.best_estimator_))
上記のコードで求められた最適なパラメータを用いてクラス分類を行ったところ、分類精度はホールドアウト検証の方が交差検証・層化k分割交差検証よりも0.1ほど良い結果が得られました。
これはニューラルネットワークの特徴か何かが要因となっているのでしょうか?
以下が各評価方法の結果を出力したコードになります。
Python
# Compare hold-out accuracy with cross-validation scores for an MLP.
# (Reconstructed from a paste that was collapsed onto a single line;
# the duplicated '35' in the drop list has been removed.)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

# Load the Shift-JIS encoded CSV, skipping the header row and
# assigning explicit column names.
df = pd.read_csv('sasa_2019.csv',
                 encoding="shift-jis",
                 skiprows=1,
                 names=['ID','menseki','totiriyou','0722R','0604R','0611R','0525A','0606A','0513A','R_avg','A_avg','rorn','0404N','0504N','0511N','00524N','1001N','1028N','1117N','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','0409N','0416N','0516N','0625N','1102N','1115N','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'])

# Drop the unused numbered columns, then remove rows with any NaN.
sasa = df.drop(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'], axis=1)
sasa = sasa.dropna(how='any')

X = sasa.loc[:, ["A_avg","0504N","1117N"]]
y = sasa['rorn']

# Hold-out split: 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize features; the scaler is fitted on the training split only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = MLPClassifier(hidden_layer_sizes=(100,100), max_iter=1000)
clf.fit(X_train, y_train)

# NOTE(review): cross_val_score below refits clones of clf on the RAW,
# unscaled X, whereas the hold-out score uses scaled data — the two
# numbers are therefore not directly comparable.  Also, cv=5 with a
# classifier uses StratifiedKFold internally, while KFold(n_splits=5)
# is NOT stratified — the labels below are effectively swapped.
score = cross_val_score(clf, X, y, cv=5)
print("Cross -Validation score:{}".format(score))

kfold = KFold(n_splits=5)
print("Cross-Validation scoreK:\n{}".format(
    cross_val_score(clf, X, y, cv=kfold)))

print("正解率=", clf.score(X_test, y_test))
この情報だけではなんとも言えませんので、もっと詳しく書いてください。
(クラス不均衡があるかどうか、各評価方法における詳細な実験条件、使用した評価指標など。また、実験に用いたコードがあるといいでしょう)
説明が少なく申し訳ありませんでした。ニューラルネットワークのハイパーパラメータの条件は、hidden_layer_sizesが(100,100)、(10,10)、(100,)、(10,)の4種類とmax_iterが10000,1000,100,10の4種類です。
以下が交差検証を用いたグリッドサーチのコードになります。
# Grid search over MLPClassifier hyper-parameters using 5-fold CV.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

# Load the Shift-JIS encoded CSV, skipping the header row and
# assigning explicit column names.
df = pd.read_csv('sasa_2019.csv',
                 encoding="shift-jis",
                 skiprows=1,
                 names=['ID','menseki','totiriyou','0722R','0604R','0611R','0525A','0606A','0513A','R_avg','A_avg','rorn','0404N','0504N','0511N','00524N','1001N','1028N','1117N','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','0409N','0416N','0516N','0625N','1102N','1115N','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'])

# Drop the unused numbered columns (the original list contained '35'
# twice), then remove rows with any NaN.
sasa = df.drop(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'], axis=1)
sasa = sasa.dropna(how='any')

# Feature matrix and target label.
X = sasa.loc[:, ["A_avg",'0404N','0504N','0511N','00524N','1001N','1028N','1117N','0409N','0416N','0516N','0625N','1102N','1115N']]
y = sasa['rorn']

# Hold-out split: 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# BUG FIX: the original fitted StandardScaler on all of X_train BEFORE
# the grid search's internal cross-validation, leaking statistics of
# each validation fold into the preprocessing.  Putting the scaler in a
# Pipeline makes GridSearchCV refit it on each training fold only.
pipe = Pipeline([('scaler', preprocessing.StandardScaler()),
                 ('mlp', MLPClassifier(random_state=0))])

# Hyper-parameter grid: 4 layer layouts x 4 iteration budgets
# (parameter names are prefixed with the pipeline step name).
param_grid = {'mlp__hidden_layer_sizes': [(100,100),(10,10),(100,),(10,)],
              'mlp__max_iter': [10000,1000,100,10]}

grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))
print("Best estimator:\n{}".format(grid_search.best_estimator_))
また各評価方法を表示させたコードは以下のようになります。
# Compare hold-out accuracy with (stratified and plain) k-fold
# cross-validation for an MLP classifier.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

# Load the Shift-JIS encoded CSV, skipping the header row and
# assigning explicit column names.
df = pd.read_csv('sasa_2019.csv',
                 encoding="shift-jis",
                 skiprows=1,
                 names=['ID','menseki','totiriyou','0722R','0604R','0611R','0525A','0606A','0513A','R_avg','A_avg','rorn','0404N','0504N','0511N','00524N','1001N','1028N','1117N','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','0409N','0416N','0516N','0625N','1102N','1115N','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'])

# Drop the unused numbered columns (the original list contained '35'
# twice), then remove rows with any NaN.
sasa = df.drop(['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31','32','33','34','35','36','37','38','39','40','41','42'], axis=1)
sasa = sasa.dropna(how='any')

X = sasa.loc[:, ["A_avg","0416N",'1102N']]
y = sasa['rorn']

# Hold-out split: 70% train / 30% test.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Standardize features; the scaler is fitted on the training split only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Hold-out model, trained on the scaled training split.
clf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=10)
clf.fit(X_train, y_train)

# BUG FIX: the original called cross_val_score(clf, X, y, ...) on the
# RAW, unscaled X while the hold-out score used scaled data, so the two
# results were not comparable (a likely cause of the observed ~0.1
# gap).  A Pipeline scales each CV training fold consistently.
cv_model = Pipeline([('scaler', preprocessing.StandardScaler()),
                     ('mlp', MLPClassifier(hidden_layer_sizes=(100,), max_iter=10))])

# BUG FIX: the labels were swapped — cv=5 with a classifier already
# uses StratifiedKFold, whereas KFold(n_splits=5) is NOT stratified.
# Both splitters are now named explicitly.
kfold = KFold(n_splits=5)
print("K-Fold (non-stratified) CV score:\n{}".format(
    cross_val_score(cv_model, X, y, cv=kfold)))

skfold = StratifiedKFold(n_splits=5)
print("Stratified K-Fold CV score:\n{}".format(
    cross_val_score(cv_model, X, y, cv=skfold)))

# Hold-out accuracy on the scaled test split.
print("正解率=", clf.score(X_test, y_test))
よろしくお願いいたします。
この欄ではなく質問を編集して追記してください(ここに書かれても見づらいしコードブロックが使えない。質問なら
```
```
で囲めばコードとして適切に表示される)。
回答1件
あなたの回答
tips
プレビュー