質問編集履歴
5
修正
test
CHANGED
File without changes
|
test
CHANGED
@@ -74,7 +74,7 @@
|
|
74
74
|
|
75
75
|
|
76
76
|
|
77
|
-
|
77
|
+
```python
|
78
78
|
|
79
79
|
import numpy as np
|
80
80
|
|
@@ -237,3 +237,5 @@
|
|
237
237
|
y_train = Xmat['SalePrice']
|
238
238
|
|
239
239
|
X_train = Xmat.drop(['SalePrice'], axis=1)
|
240
|
+
|
241
|
+
```
|
4
情報の修正
test
CHANGED
File without changes
|
test
CHANGED
@@ -76,13 +76,11 @@
|
|
76
76
|
|
77
77
|
### 全コード
|
78
78
|
|
79
|
-
import numpy as np
|
79
|
+
import numpy as np
|
80
|
-
|
81
|
-
|
80
|
+
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
81
|
+
import pandas as pd
|
82
|
+
|
83
|
+
|
86
84
|
|
87
85
|
import matplotlib.pyplot as plt
|
88
86
|
|
@@ -90,16 +88,12 @@
|
|
90
88
|
|
91
89
|
|
92
90
|
|
93
|
-
# データをインポート
|
94
|
-
|
95
91
|
train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
|
96
92
|
|
97
93
|
test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
|
98
94
|
|
99
95
|
|
100
96
|
|
101
|
-
# データを調べる。
|
102
|
-
|
103
97
|
print(train.dtypes)
|
104
98
|
|
105
99
|
train.shape
|
@@ -108,10 +102,6 @@
|
|
108
102
|
|
109
103
|
|
110
104
|
|
111
|
-
|
112
|
-
|
113
|
-
# ラベル化
|
114
|
-
|
115
105
|
from sklearn.preprocessing import LabelEncoder
|
116
106
|
|
117
107
|
|
@@ -130,8 +120,6 @@
|
|
130
120
|
|
131
121
|
|
132
122
|
|
133
|
-
# 欠損の可視化
|
134
|
-
|
135
123
|
import missingno as msno
|
136
124
|
|
137
125
|
train.head(5)
|
@@ -140,18 +128,12 @@
|
|
140
128
|
|
141
129
|
|
142
130
|
|
143
|
-
|
144
|
-
|
145
|
-
# keep ID for submission
|
146
|
-
|
147
131
|
train_ID = train['Id']
|
148
132
|
|
149
133
|
test_ID = test['Id']
|
150
134
|
|
151
135
|
|
152
136
|
|
153
|
-
# split data for training
|
154
|
-
|
155
137
|
y_train = train['SalePrice']
|
156
138
|
|
157
139
|
X_train = train.drop(['Id','SalePrice'], axis=1)
|
@@ -160,8 +142,6 @@
|
|
160
142
|
|
161
143
|
|
162
144
|
|
163
|
-
# dealing with missing data
|
164
|
-
|
165
145
|
Xmat = pd.concat([X_train, X_test])
|
166
146
|
|
167
147
|
Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
|
@@ -170,39 +150,21 @@
|
|
170
150
|
|
171
151
|
|
172
152
|
|
173
|
-
# 新しい特徴量を作る。
|
174
|
-
|
175
153
|
Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
|
176
154
|
|
177
155
|
|
178
156
|
|
179
|
-
# Xmatを分割して代入(分割間違えないように)
|
180
|
-
|
181
|
-
Xmat.shape
|
182
|
-
|
183
|
-
X_train.shape
|
184
|
-
|
185
|
-
X_test.shape
|
186
|
-
|
187
157
|
X_train=Xmat[0:1460]
|
188
158
|
|
189
159
|
X_test=Xmat[1460:2920]
|
190
160
|
|
191
|
-
|
161
|
+
|
192
|
-
|
193
|
-
X_test.shape
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
# 目的変数を調べる
|
200
162
|
|
201
163
|
ax = sns.distplot(y_train)
|
202
164
|
|
203
165
|
plt.show()
|
204
166
|
|
205
|
-
|
167
|
+
|
206
168
|
|
207
169
|
y_train = np.log(y_train)
|
208
170
|
|
@@ -212,12 +174,6 @@
|
|
212
174
|
|
213
175
|
|
214
176
|
|
215
|
-
|
216
|
-
|
217
|
-
# 特徴量の重要性を調べる。
|
218
|
-
|
219
|
-
# ランダムフォレストで調べる。
|
220
|
-
|
221
177
|
from sklearn.ensemble import RandomForestRegressor
|
222
178
|
|
223
179
|
rf = RandomForestRegressor(n_estimators=80, max_features='auto')
|
@@ -242,24 +198,18 @@
|
|
242
198
|
|
243
199
|
|
244
200
|
|
245
|
-
# 重要度の高い30の特徴量を使用。
|
246
|
-
|
247
201
|
X_train = X_train.iloc[:,ranking[:30]]
|
248
202
|
|
249
203
|
X_test = X_test.iloc[:,ranking[:30]]
|
250
204
|
|
251
205
|
|
252
206
|
|
253
|
-
# 重要度の高い特徴量2つを使って新しい特徴量を作る。
|
254
|
-
|
255
207
|
X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
|
256
208
|
|
257
209
|
X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
|
258
210
|
|
259
211
|
|
260
212
|
|
261
|
-
# 特徴量と目的変数の相関を調べる。
|
262
|
-
|
263
213
|
fig = plt.figure(figsize=(12,7))
|
264
214
|
|
265
215
|
for i in np.arange(30):
|
@@ -276,30 +226,14 @@
|
|
276
226
|
|
277
227
|
|
278
228
|
|
279
|
-
# トレインデータの外れ値を処理。
|
280
|
-
|
281
|
-
X_train.shape
|
282
|
-
|
283
229
|
Xmat = X_train
|
284
230
|
|
285
|
-
Xmat.shape
|
286
|
-
|
287
231
|
Xmat['SalePrice'] = y_train
|
288
232
|
|
289
|
-
Xmat.shape
|
290
|
-
|
291
233
|
Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
|
292
234
|
|
293
|
-
Xmat.shape
|
294
|
-
|
295
235
|
Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
|
296
236
|
|
297
237
|
y_train = Xmat['SalePrice']
|
298
238
|
|
299
|
-
Xmat.shape
|
300
|
-
|
301
239
|
X_train = Xmat.drop(['SalePrice'], axis=1)
|
302
|
-
|
303
|
-
X_train.shape
|
304
|
-
|
305
|
-
Xmat.shape
|
3
情報の追加
test
CHANGED
File without changes
|
test
CHANGED
@@ -71,3 +71,235 @@
|
|
71
71
|
における、自分の実行結果を載せます。
|
72
72
|
|
73
73
|
![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
|
74
|
+
|
75
|
+
|
76
|
+
|
77
|
+
### 全コード
|
78
|
+
|
79
|
+
import numpy as np # linear algebra
|
80
|
+
|
81
|
+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
82
|
+
|
83
|
+
|
84
|
+
|
85
|
+
# for visualization
|
86
|
+
|
87
|
+
import matplotlib.pyplot as plt
|
88
|
+
|
89
|
+
import seaborn as sns
|
90
|
+
|
91
|
+
|
92
|
+
|
93
|
+
# データをインポート
|
94
|
+
|
95
|
+
train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
|
96
|
+
|
97
|
+
test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
# データを調べる。
|
102
|
+
|
103
|
+
print(train.dtypes)
|
104
|
+
|
105
|
+
train.shape
|
106
|
+
|
107
|
+
test.shape
|
108
|
+
|
109
|
+
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
# ラベル化
|
114
|
+
|
115
|
+
from sklearn.preprocessing import LabelEncoder
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
for i in range(train.shape[1]):
|
120
|
+
|
121
|
+
if train.iloc[:,i].dtypes == object:
|
122
|
+
|
123
|
+
lbl = LabelEncoder()
|
124
|
+
|
125
|
+
lbl.fit(list(train.iloc[:,i].values) + list(test.iloc[:,i].values))
|
126
|
+
|
127
|
+
train.iloc[:,i] = lbl.transform(list(train.iloc[:,i].values))
|
128
|
+
|
129
|
+
test.iloc[:,i] = lbl.transform(list(test.iloc[:,i].values))
|
130
|
+
|
131
|
+
|
132
|
+
|
133
|
+
# 欠損の可視化
|
134
|
+
|
135
|
+
import missingno as msno
|
136
|
+
|
137
|
+
train.head(5)
|
138
|
+
|
139
|
+
msno.matrix(df=train, figsize=(20,14), color=(0.5,0,0))
|
140
|
+
|
141
|
+
|
142
|
+
|
143
|
+
|
144
|
+
|
145
|
+
# keep ID for submission
|
146
|
+
|
147
|
+
train_ID = train['Id']
|
148
|
+
|
149
|
+
test_ID = test['Id']
|
150
|
+
|
151
|
+
|
152
|
+
|
153
|
+
# split data for training
|
154
|
+
|
155
|
+
y_train = train['SalePrice']
|
156
|
+
|
157
|
+
X_train = train.drop(['Id','SalePrice'], axis=1)
|
158
|
+
|
159
|
+
X_test = test.drop('Id', axis=1)
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
# dealing with missing data
|
164
|
+
|
165
|
+
Xmat = pd.concat([X_train, X_test])
|
166
|
+
|
167
|
+
Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
|
168
|
+
|
169
|
+
Xmat = Xmat.fillna(Xmat.median())
|
170
|
+
|
171
|
+
|
172
|
+
|
173
|
+
# 新しい特徴量を作る。
|
174
|
+
|
175
|
+
Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
|
176
|
+
|
177
|
+
|
178
|
+
|
179
|
+
# Xmatを分割して代入(分割間違えないように)
|
180
|
+
|
181
|
+
Xmat.shape
|
182
|
+
|
183
|
+
X_train.shape
|
184
|
+
|
185
|
+
X_test.shape
|
186
|
+
|
187
|
+
X_train=Xmat[0:1460]
|
188
|
+
|
189
|
+
X_test=Xmat[1460:2920]
|
190
|
+
|
191
|
+
X_train.shape
|
192
|
+
|
193
|
+
X_test.shape
|
194
|
+
|
195
|
+
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
# 目的変数を調べる
|
200
|
+
|
201
|
+
ax = sns.distplot(y_train)
|
202
|
+
|
203
|
+
plt.show()
|
204
|
+
|
205
|
+
# logを使って目的変数の正規化
|
206
|
+
|
207
|
+
y_train = np.log(y_train)
|
208
|
+
|
209
|
+
ax = sns.distplot(y_train)
|
210
|
+
|
211
|
+
plt.show()
|
212
|
+
|
213
|
+
|
214
|
+
|
215
|
+
|
216
|
+
|
217
|
+
# 特徴量の重要性を調べる。
|
218
|
+
|
219
|
+
# ランダムフォレストで調べる。
|
220
|
+
|
221
|
+
from sklearn.ensemble import RandomForestRegressor
|
222
|
+
|
223
|
+
rf = RandomForestRegressor(n_estimators=80, max_features='auto')
|
224
|
+
|
225
|
+
rf.fit(X_train, y_train)
|
226
|
+
|
227
|
+
print('Training done using Random Forest')
|
228
|
+
|
229
|
+
|
230
|
+
|
231
|
+
ranking = np.argsort(-rf.feature_importances_)
|
232
|
+
|
233
|
+
f, ax = plt.subplots(figsize=(11, 9))
|
234
|
+
|
235
|
+
sns.barplot(x=rf.feature_importances_[ranking], y=X_train.columns.values[ranking], orient='h')
|
236
|
+
|
237
|
+
ax.set_xlabel("feature importance")
|
238
|
+
|
239
|
+
plt.tight_layout()
|
240
|
+
|
241
|
+
plt.show()
|
242
|
+
|
243
|
+
|
244
|
+
|
245
|
+
# 重要度の高い30の特徴量を使用。
|
246
|
+
|
247
|
+
X_train = X_train.iloc[:,ranking[:30]]
|
248
|
+
|
249
|
+
X_test = X_test.iloc[:,ranking[:30]]
|
250
|
+
|
251
|
+
|
252
|
+
|
253
|
+
# 重要度の高い特徴量2つを使って新しい特徴量を作る。
|
254
|
+
|
255
|
+
X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
|
256
|
+
|
257
|
+
X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
|
258
|
+
|
259
|
+
|
260
|
+
|
261
|
+
# 特徴量と目的変数の相関を調べる。
|
262
|
+
|
263
|
+
fig = plt.figure(figsize=(12,7))
|
264
|
+
|
265
|
+
for i in np.arange(30):
|
266
|
+
|
267
|
+
ax = fig.add_subplot(5,6,i+1)
|
268
|
+
|
269
|
+
sns.regplot(x=X_train.iloc[:,i], y=y_train)
|
270
|
+
|
271
|
+
|
272
|
+
|
273
|
+
plt.tight_layout()
|
274
|
+
|
275
|
+
plt.show()
|
276
|
+
|
277
|
+
|
278
|
+
|
279
|
+
# トレインデータの外れ値を処理。
|
280
|
+
|
281
|
+
X_train.shape
|
282
|
+
|
283
|
+
Xmat = X_train
|
284
|
+
|
285
|
+
Xmat.shape
|
286
|
+
|
287
|
+
Xmat['SalePrice'] = y_train
|
288
|
+
|
289
|
+
Xmat.shape
|
290
|
+
|
291
|
+
Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
|
292
|
+
|
293
|
+
Xmat.shape
|
294
|
+
|
295
|
+
Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
|
296
|
+
|
297
|
+
y_train = Xmat['SalePrice']
|
298
|
+
|
299
|
+
Xmat.shape
|
300
|
+
|
301
|
+
X_train = Xmat.drop(['SalePrice'], axis=1)
|
302
|
+
|
303
|
+
X_train.shape
|
304
|
+
|
305
|
+
Xmat.shape
|
2
間違ったタグの修正
test
CHANGED
File without changes
|
test
CHANGED
File without changes
|
1
情報の補足
test
CHANGED
File without changes
|
test
CHANGED
@@ -63,3 +63,11 @@
|
|
63
63
|
windows 10
|
64
64
|
|
65
65
|
anaconda,atom 最新アップデートを使用
|
66
|
+
|
67
|
+
|
68
|
+
|
69
|
+
7) 各Featureとターゲットの関係を調べる
|
70
|
+
|
71
|
+
における、自分の実行結果を載せます。
|
72
|
+
|
73
|
+
![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
|