Question edit history

1. Appended the code and part of the data

I ran Train and Test with several algorithms, but the test scores come out better than the train scores, and on top of that many of the test scores are exactly 1.0 (see the attachment). *The data was thrown together just for practice.

Can this actually happen, or is it highly likely that there is a mistake somewhere in the code?

|
6
|
+
|
7
|
+
|
8
|
+
|
9
|
+
The code is posted below. I am still a beginner, so some of it is repeated and it may be hard to read; please bear with me.

Only part of the data is attached; `left` is the target variable. ![sample of the data](7ed368c6b255a386d5b29449c363a03b.png)

```Python
# unify the dtypes of 'sales' and 'salary' as object (categorical)
# import the sample data for classification
ohe_columns = ['sales',
               'salary']

my_dtype = {'sales': object,
            'salary': object}

import pandas as pd

df = pd.read_csv('./data/finaldayP.csv', header=0, dtype=my_dtype)

X = df.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10]]  # every column except the target as features X
X = X.drop('index', axis=1)  # the first column is ID information, so drop it from the features
y = df.iloc[:, 1]  # the second column is the target

# check the shape
print('X shape: (%i,%i)' % X.shape)
print('--------------------')
print(y.value_counts())
X.join(y).head()

# one-hot encode the categorical variables
X_new = pd.get_dummies(X,
                       dummy_na=True,
                       columns=ohe_columns)

display(X_new.head())
print(X_new.shape)
X_new.describe()

from sklearn.impute import SimpleImputer

# instantiate the imputer and learn the column means;
# SimpleImputer fills missing values with the mean (or median) of each column
imp = SimpleImputer()
imp.fit(X_new)

# apply the fitted imputer to replace the missing values in each column
# (for preprocessing this is transform, not predict)
X_ohe_columns = X_new.columns.values
X_ohe = pd.DataFrame(imp.transform(X_new), columns=X_ohe_columns)
X = X_ohe

# show the result
display(X.head())

# load the scoring data
df_s = pd.read_csv('./data/finaldayPtest.csv', header=0, dtype=my_dtype)

X_s = df_s.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 9, 10]]
X_s = X_s.drop('index', axis=1)
y_s = df_s.iloc[:, 1]

# check the shape
print('Raw shape: (%i,%i)' % df_s.shape)
print('X shape: (%i,%i)' % X_s.shape)
print('-------------------------------')
print(X_s.dtypes)

# preprocess the scoring data: encode the categoricals and handle missing values
X_ohe_s = pd.get_dummies(X_s,
                         dummy_na=True,
                         columns=ohe_columns)
print('X_ohe_s shape:(%i,%i)' % X_ohe_s.shape)
X_ohe_s.head()

# use Python sets to see the differences between the two column sets
cols_model = set(X_ohe.columns.values)
cols_score = set(X_ohe_s.columns.values)

# columns present in the model data but missing from the scoring data
diff1 = cols_model - cols_score
print('Model only: %s' % diff1)

# columns present in the scoring data but missing from the model data
diff2 = cols_score - cols_model
print('Score only: %s' % diff2)

# empty frame that carries the model's column layout
df_cols_m = pd.DataFrame(None,
                         columns=X_ohe_columns,
                         dtype=float)
display(df_cols_m)

X_ohe_s2 = pd.concat([df_cols_m, X_ohe_s])
print(X_ohe_s2.shape)
display(X_ohe_s2.head(3))

# drop features that exist in the scoring data but not in the model data
set_Xm = set(X_ohe.columns.values)
set_Xs = set(X_ohe_s.columns.values)

X_ohe_s3 = X_ohe_s2.drop(list(set_Xs - set_Xm), axis=1)
print(X_ohe_s3.shape)
display(X_ohe_s3.head(3))

# fill the model-only columns with 0
X_ohe_s3.loc[:, list(set_Xm - set_Xs)] = X_ohe_s3.loc[:, list(set_Xm - set_Xs)].fillna(0)
X_ohe_s3.head(3)

# use reindex to put the columns in the model's order
X_ohe_s3 = X_ohe_s3.reindex(X_ohe.columns.values, axis=1)
X_ohe_s3.head(3)
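
# Aside (a sketch, not part of the original flow): the whole concat / drop /
# fillna / reindex alignment above can be collapsed into a single call,
# because reindex both adds the missing model columns (filled with 0) and
# drops the score-only columns at the same time:
#   X_ohe_s3 = X_ohe_s.reindex(columns=X_ohe_columns, fill_value=0)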

# isnull flags the missing cells; summing the booleans counts them
print('missing values (before numeric imputation):', X_ohe_s3.isnull().sum().sum())
X_ohe_s4 = pd.DataFrame(imp.transform(X_ohe_s3), columns=X_ohe_columns)
print('missing values (after numeric imputation):', X_ohe_s4.isnull().sum().sum())

X_fin_s = X_ohe_s4
print(X_fin_s.shape)
X_fin_s.head(3)

X.join(y).head()

# import libraries
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

# holdout split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=1)

# set up pipelines for eight different algorithms
pipelines = {
    'knn':
        Pipeline([('scl', StandardScaler()),
                  ('est', KNeighborsClassifier())]),
    'logistic':
        Pipeline([('scl', StandardScaler()),
                  ('est', LogisticRegression(random_state=1))]),
    'rsvc':
        Pipeline([('scl', StandardScaler()),
                  ('est', SVC(C=1.0, kernel='rbf', class_weight='balanced', random_state=1))]),
    'lsvc':
        Pipeline([('scl', StandardScaler()),
                  ('est', LinearSVC(C=1.0, class_weight='balanced', random_state=1))]),
    'tree':
        Pipeline([('scl', StandardScaler()),
                  ('est', DecisionTreeClassifier(random_state=1))]),
    'rf':
        Pipeline([('scl', StandardScaler()),
                  ('est', RandomForestClassifier(random_state=1))]),
    'gb':
        Pipeline([('scl', StandardScaler()),
                  ('est', GradientBoostingClassifier(random_state=1))]),
    'mlp':
        Pipeline([('scl', StandardScaler()),
                  ('est', MLPClassifier(hidden_layer_sizes=(3, 3),
                                        max_iter=1000,
                                        random_state=1))]),
}

# fit the models
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    print(pipe_name, ': Fitting Done')
print(X.shape)
print(y.shape)

# accuracy on the train and test splits for every pipeline
scores = {}
for pipe_name, pipeline in pipelines.items():
    scores[(pipe_name, 'train')] = accuracy_score(y_train, pipeline.predict(X_train))
    scores[(pipe_name, 'test')] = accuracy_score(y_test, pipeline.predict(X_test))

pd.Series(scores).unstack()
```
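
The notebook prepares the scoring data `X_fin_s` but never actually scores it. As a minimal sketch (assuming the cells above have run; `'rf'` is an arbitrary pick, and `y_s` holds the labels read from finaldayPtest.csv), it could be evaluated like this:

```Python
# a sketch, not part of the original notebook: evaluate one fitted
# pipeline on the separately prepared scoring data
y_pred_s = pipelines['rf'].predict(X_fin_s)  # 'rf' chosen arbitrarily
print('score-data accuracy:', accuracy_score(y_s, y_pred_s))
```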