質問編集履歴
4
実際のコードを書き直し
test
CHANGED
File without changes
|
test
CHANGED
@@ -46,7 +46,7 @@
|
|
46
46
|
import seaborn as sns
|
47
47
|
|
48
48
|
# Loading train data
|
49
|
-
df_train = pd.read_csv('/***
|
49
|
+
df_train = pd.read_csv('/***/train.csv')
|
50
50
|
|
51
51
|
# Install XGBoost
|
52
52
|
import xgboost as xgb
|
@@ -61,14 +61,14 @@
|
|
61
61
|
# データ分割
|
62
62
|
# df_trainをtrainとtestに分ける
|
63
63
|
# stratifyに設定したデータが均一になるように分割
|
64
|
-
train, test = train_test_split(df_train, test_size = 0.1, stratify = df_train["
|
64
|
+
train, test = train_test_split(df_train, test_size = 0.1, stratify = df_train["target"])
|
65
65
|
|
66
66
|
# 説明変数、目的変数を格納
|
67
|
-
X_train = train.drop(['i
|
67
|
+
X_train = train.drop(['id','target'], axis=1)
|
68
|
-
y_train = train['
|
68
|
+
y_train = train['target']
|
69
69
|
|
70
|
-
X_test = test.drop(['i
|
70
|
+
X_test = test.drop(['id','target'], axis=1)
|
71
|
-
y_test = test['
|
71
|
+
y_test = test['target']
|
72
72
|
|
73
73
|
print(X_train.shape, y_train.shape)
|
74
74
|
print(X_test.shape, y_test.shape)
|
@@ -116,10 +116,10 @@
|
|
116
116
|
## テストデータで実行する
|
117
117
|
|
118
118
|
# Loading test data
|
119
|
-
df_test = pd.read_csv('/***
|
119
|
+
df_test = pd.read_csv('/***/test.csv')
|
120
120
|
|
121
121
|
# drop 'index' and save with another name
|
122
|
-
df_test2 = df_test.drop('i
|
122
|
+
df_test2 = df_test.drop('id', axis=1)
|
123
123
|
|
124
124
|
# データ形式の変換して、確認
|
125
125
|
df_test3 = xgb.DMatrix(df_test2)
|
@@ -131,9 +131,9 @@
|
|
131
131
|
|
132
132
|
# 予測結果をcsvに保存する
|
133
133
|
|
134
|
-
sample = pd.read_csv('/****/sample_submi
|
134
|
+
sample = pd.read_csv('/****/sample_submission.csv', header=None)
|
135
135
|
sample[1] = pred.astype('int')
|
136
|
-
sample.to_csv('/***
|
136
|
+
sample.to_csv('/***/sample_submission_20230428.csv',index=None, header=None)
|
137
137
|
|
138
138
|
```
|
139
139
|
|
3
「# train dataでの予測モデルをTest dataに適用する」と「# 予測結果をcsvに保存する」の部分を変更しました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -132,7 +132,7 @@
|
|
132
132
|
# 予測結果をcsvに保存する
|
133
133
|
|
134
134
|
sample = pd.read_csv('/****/sample_submit.csv', header=None)
|
135
|
-
sample[1] = pred
|
135
|
+
sample[1] = pred.astype('int')
|
136
136
|
sample.to_csv('/****/sample_submit_20230328.csv',index=None, header=None)
|
137
137
|
|
138
138
|
```
|
2
「実際のコード」を編集しました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -106,8 +106,8 @@
|
|
106
106
|
|
107
107
|
# XGBoostの評価
|
108
108
|
pred = model.predict(dtest).round()
|
109
|
-
print(pred[:5]
|
109
|
+
print(pred) #[:5]
|
110
|
-
print(y_test[:5]
|
110
|
+
print(y_test) #[:5]
|
111
111
|
|
112
112
|
# 正答率
|
113
113
|
from sklearn.metrics import accuracy_score
|
@@ -121,9 +121,13 @@
|
|
121
121
|
# drop 'index' and save with another name
|
122
122
|
df_test2 = df_test.drop('index', axis=1)
|
123
123
|
|
124
|
+
# データ形式の変換して、確認
|
125
|
+
df_test3 = xgb.DMatrix(df_test2)
|
126
|
+
print(type(df_test3))
|
127
|
+
|
124
128
|
# train dataでの予測モデルをTest dataに適用する
|
125
129
|
# 予測
|
126
|
-
pred = model.predict(df_test
|
130
|
+
pred = model.predict(df_test3).round()
|
127
131
|
|
128
132
|
# 予測結果をcsvに保存する
|
129
133
|
|
1
csvを読み込むところから、分析を実行し、得られたモデルをテストデータに適用して、その予測結果をcsvファイルに保存するところまでのコードを追記しました。
test
CHANGED
File without changes
|
test
CHANGED
@@ -31,4 +31,105 @@
|
|
31
31
|
|
32
32
|
XGBoostで回帰分析を行ったときも、学習用データでモデルを作り、今回と同じようにそのモデルをテストデータに当てはめて予測できました。しかし、今回はなぜうまくいかないのか原因がわかりません。お助けください。
|
33
33
|
|
34
|
+
### 実際のコード
|
34
35
|
|
36
|
+
csvを読み込み、XGBoostでロジスティック回帰（二値分類、objective は binary:logistic）のモデルを学習し、そのモデルをテストデータに適用するところまでのコードは次の通りです。
|
37
|
+
|
38
|
+
test.csvには目的変数となる'Outcome'は与えられていません。
|
39
|
+
|
40
|
+
```python
|
41
|
+
|
42
|
+
# Call main libraries
|
43
|
+
import numpy as np
|
44
|
+
import pandas as pd
|
45
|
+
import matplotlib.pyplot as plt
|
46
|
+
import seaborn as sns
|
47
|
+
|
48
|
+
# Loading train data
|
49
|
+
df_train = pd.read_csv('/*****/train.csv')
|
50
|
+
|
51
|
+
# Install XGBoost
|
52
|
+
import xgboost as xgb
|
53
|
+
|
54
|
+
# Install other libraries
|
55
|
+
import tensorflow as tf
|
56
|
+
from tensorflow.keras.models import Sequential
|
57
|
+
from tensorflow.keras.layers import Activation, Dense, Dropout, Input, BatchNormalization
|
58
|
+
from sklearn.model_selection import train_test_split
|
59
|
+
from sklearn.metrics import mean_squared_error
|
60
|
+
|
61
|
+
# データ分割
|
62
|
+
# df_trainをtrainとtestに分ける
|
63
|
+
# stratifyに設定したデータが均一になるように分割
|
64
|
+
train, test = train_test_split(df_train, test_size = 0.1, stratify = df_train["Outcome"])
|
65
|
+
|
66
|
+
# 説明変数、目的変数を格納
|
67
|
+
X_train = train.drop(['index','Outcome'], axis=1)
|
68
|
+
y_train = train['Outcome']
|
69
|
+
|
70
|
+
X_test = test.drop(['index','Outcome'], axis=1)
|
71
|
+
y_test = test['Outcome']
|
72
|
+
|
73
|
+
print(X_train.shape, y_train.shape)
|
74
|
+
print(X_test.shape, y_test.shape)
|
75
|
+
|
76
|
+
# データ形式の変換
|
77
|
+
dtrain = xgb.DMatrix(X_train, y_train)
|
78
|
+
dtest = xgb.DMatrix(X_test, y_test)
|
79
|
+
|
80
|
+
# パラメータ設定
|
81
|
+
# binary: 二値分類
|
82
|
+
params = {
|
83
|
+
"objective": "binary:logistic",
|
84
|
+
"eval_metric": "logloss",
|
85
|
+
}
|
86
|
+
|
87
|
+
# 履歴保存用の変数
|
88
|
+
history = {}
|
89
|
+
|
90
|
+
# 学習
|
91
|
+
model = xgb.train(
|
92
|
+
params = params,
|
93
|
+
dtrain = dtrain,
|
94
|
+
evals = [(dtrain, "train"), (dtest, "test")],
|
95
|
+
evals_result = history,
|
96
|
+
num_boost_round = 100,
|
97
|
+
early_stopping_rounds = 10,
|
98
|
+
)
|
99
|
+
|
100
|
+
plt.plot(history["train"]["logloss"],label = "train")
|
101
|
+
plt.plot(history["test"]["logloss"],label = "test")
|
102
|
+
plt.legend()
|
103
|
+
plt.xlabel('rounds')
|
104
|
+
plt.ylabel('logloss')
|
105
|
+
plt.show()
|
106
|
+
|
107
|
+
# XGBoostの評価
|
108
|
+
pred = model.predict(dtest).round()
|
109
|
+
print(pred[:5])
|
110
|
+
print(y_test[:5])
|
111
|
+
|
112
|
+
# 正答率
|
113
|
+
from sklearn.metrics import accuracy_score
|
114
|
+
print(accuracy_score(y_test, pred))
|
115
|
+
|
116
|
+
## テストデータで実行する
|
117
|
+
|
118
|
+
# Loading test data
|
119
|
+
df_test = pd.read_csv('/****/test.csv')
|
120
|
+
|
121
|
+
# drop 'index' and save with another name
|
122
|
+
df_test2 = df_test.drop('index', axis=1)
|
123
|
+
|
124
|
+
# train dataでの予測モデルをTest dataに適用する
|
125
|
+
# 予測
|
126
|
+
pred = model.predict(df_test2)
|
127
|
+
|
128
|
+
# 予測結果をcsvに保存する
|
129
|
+
|
130
|
+
sample = pd.read_csv('/****/sample_submit.csv', header=None)
|
131
|
+
sample[1] = pred
|
132
|
+
sample.to_csv('/****/sample_submit_20230328.csv',index=None, header=None)
|
133
|
+
|
134
|
+
```
|
135
|
+
|