質問編集履歴
5
修正
title
CHANGED
File without changes
|
body
CHANGED
@@ -36,7 +36,7 @@
|
|
36
36
|
における、自分の実行結果を載せます。
|
37
37
|

|
38
38
|
|
39
|
-
|
39
|
+
```python
|
40
40
|
import numpy as np
|
41
41
|
import pandas as pd
|
42
42
|
|
@@ -117,4 +117,5 @@
|
|
117
117
|
Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
|
118
118
|
Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
|
119
119
|
y_train = Xmat['SalePrice']
|
120
|
-
X_train = Xmat.drop(['SalePrice'], axis=1)
|
120
|
+
X_train = Xmat.drop(['SalePrice'], axis=1)
|
121
|
+
```
|
4
情報の修正
title
CHANGED
File without changes
|
body
CHANGED
@@ -37,24 +37,19 @@
|
|
37
37
|

|
38
38
|
|
39
39
|
### 全コード
|
40
|
-
import numpy as np
|
40
|
+
import numpy as np
|
41
|
-
import pandas as pd
|
41
|
+
import pandas as pd
|
42
42
|
|
43
|
-
# for visualization
|
44
43
|
import matplotlib.pyplot as plt
|
45
44
|
import seaborn as sns
|
46
45
|
|
47
|
-
# データをインポート
|
48
46
|
train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
|
49
47
|
test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
|
50
48
|
|
51
|
-
# データを調べる。
|
52
49
|
print(train.dtypes)
|
53
50
|
train.shape
|
54
51
|
test.shape
|
55
52
|
|
56
|
-
|
57
|
-
# ラベル化
|
58
53
|
from sklearn.preprocessing import LabelEncoder
|
59
54
|
|
60
55
|
for i in range(train.shape[1]):
|
@@ -64,50 +59,33 @@
|
|
64
59
|
        train.iloc[:,i] = lbl.transform(list(train.iloc[:,i].values))
|
65
60
|
        test.iloc[:,i] = lbl.transform(list(test.iloc[:,i].values))
|
66
61
|
|
67
|
-
# 欠損の可視化
|
68
62
|
import missingno as msno
|
69
63
|
train.head(5)
|
70
64
|
msno.matrix(df=train, figsize=(20,14), color=(0.5,0,0))
|
71
65
|
|
72
|
-
|
73
|
-
# keep ID for submission
|
74
66
|
train_ID = train['Id']
|
75
67
|
test_ID = test['Id']
|
76
68
|
|
77
|
-
# split data for training
|
78
69
|
y_train = train['SalePrice']
|
79
70
|
X_train = train.drop(['Id','SalePrice'], axis=1)
|
80
71
|
X_test = test.drop('Id', axis=1)
|
81
72
|
|
82
|
-
# dealing with missing data
|
83
73
|
Xmat = pd.concat([X_train, X_test])
|
84
74
|
Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
|
85
75
|
Xmat = Xmat.fillna(Xmat.median())
|
86
76
|
|
87
|
-
# 新しい特徴量を作る。
|
88
77
|
Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
|
89
78
|
|
90
|
-
# Xmatを分割して代入(分割間違えないように)
|
91
|
-
Xmat.shape
|
92
|
-
X_train.shape
|
93
|
-
X_test.shape
|
94
79
|
X_train=Xmat[0:1460]
|
95
80
|
X_test=Xmat[1460:2920]
|
96
|
-
X_train.shape
|
97
|
-
X_test.shape
|
98
81
|
|
99
|
-
|
100
|
-
# 目的変数を調べる
|
101
82
|
ax = sns.distplot(y_train)
|
102
83
|
plt.show()
|
103
|
-
|
84
|
+
|
104
85
|
y_train = np.log(y_train)
|
105
86
|
ax = sns.distplot(y_train)
|
106
87
|
plt.show()
|
107
88
|
|
108
|
-
|
109
|
-
# 特徴量の重要性を調べる。
|
110
|
-
# ランダムフォレストで調べる。
|
111
89
|
from sklearn.ensemble import RandomForestRegressor
|
112
90
|
rf = RandomForestRegressor(n_estimators=80, max_features='auto')
|
113
91
|
rf.fit(X_train, y_train)
|
@@ -120,15 +98,12 @@
|
|
120
98
|
plt.tight_layout()
|
121
99
|
plt.show()
|
122
100
|
|
123
|
-
# 重要度の高い30の特徴量を使用。
|
124
101
|
X_train = X_train.iloc[:,ranking[:30]]
|
125
102
|
X_test = X_test.iloc[:,ranking[:30]]
|
126
103
|
|
127
|
-
# 重要度の高い特徴量2つを使って新しい特徴量を作る。
|
128
104
|
X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
|
129
105
|
X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
|
130
106
|
|
131
|
-
# 特徴量と目的変数の相関を調べる。
|
132
107
|
fig = plt.figure(figsize=(12,7))
|
133
108
|
for i in np.arange(30):
|
134
109
|
    ax = fig.add_subplot(5,6,i+1)
|
@@ -137,17 +112,9 @@
|
|
137
112
|
plt.tight_layout()
|
138
113
|
plt.show()
|
139
114
|
|
140
|
-
# トレインデータの外れ値を処理。
|
141
|
-
X_train.shape
|
142
115
|
Xmat = X_train
|
143
|
-
Xmat.shape
|
144
116
|
Xmat['SalePrice'] = y_train
|
145
|
-
Xmat.shape
|
146
117
|
Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
|
147
|
-
Xmat.shape
|
148
118
|
Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
|
149
119
|
y_train = Xmat['SalePrice']
|
150
|
-
Xmat.shape
|
151
|
-
X_train = Xmat.drop(['SalePrice'], axis=1)
|
120
|
+
X_train = Xmat.drop(['SalePrice'], axis=1)
|
152
|
-
X_train.shape
|
153
|
-
Xmat.shape
|
3
情報の追加
title
CHANGED
File without changes
|
body
CHANGED
@@ -34,4 +34,120 @@
|
|
34
34
|
|
35
35
|
7) 各Featureとターゲットの関係を調べる
|
36
36
|
における、自分の実行結果を載せます。
|
37
|
-

|
37
|
+

|
38
|
+
|
39
|
+
### 全コード
|
40
|
+
import numpy as np # linear algebra
|
41
|
+
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
|
42
|
+
|
43
|
+
# for visualization
|
44
|
+
import matplotlib.pyplot as plt
|
45
|
+
import seaborn as sns
|
46
|
+
|
47
|
+
# データをインポート
|
48
|
+
train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
|
49
|
+
test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
|
50
|
+
|
51
|
+
# データを調べる。
|
52
|
+
print(train.dtypes)
|
53
|
+
train.shape
|
54
|
+
test.shape
|
55
|
+
|
56
|
+
|
57
|
+
# ラベル化
|
58
|
+
from sklearn.preprocessing import LabelEncoder
|
59
|
+
|
60
|
+
for i in range(train.shape[1]):
|
61
|
+
    if train.iloc[:,i].dtypes == object:
|
62
|
+
        lbl = LabelEncoder()
|
63
|
+
        lbl.fit(list(train.iloc[:,i].values) + list(test.iloc[:,i].values))
|
64
|
+
        train.iloc[:,i] = lbl.transform(list(train.iloc[:,i].values))
|
65
|
+
        test.iloc[:,i] = lbl.transform(list(test.iloc[:,i].values))
|
66
|
+
|
67
|
+
# 欠損の可視化
|
68
|
+
import missingno as msno
|
69
|
+
train.head(5)
|
70
|
+
msno.matrix(df=train, figsize=(20,14), color=(0.5,0,0))
|
71
|
+
|
72
|
+
|
73
|
+
# keep ID for submission
|
74
|
+
train_ID = train['Id']
|
75
|
+
test_ID = test['Id']
|
76
|
+
|
77
|
+
# split data for training
|
78
|
+
y_train = train['SalePrice']
|
79
|
+
X_train = train.drop(['Id','SalePrice'], axis=1)
|
80
|
+
X_test = test.drop('Id', axis=1)
|
81
|
+
|
82
|
+
# dealing with missing data
|
83
|
+
Xmat = pd.concat([X_train, X_test])
|
84
|
+
Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
|
85
|
+
Xmat = Xmat.fillna(Xmat.median())
|
86
|
+
|
87
|
+
# 新しい特徴量を作る。
|
88
|
+
Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
|
89
|
+
|
90
|
+
# Xmatを分割して代入(分割間違えないように)
|
91
|
+
Xmat.shape
|
92
|
+
X_train.shape
|
93
|
+
X_test.shape
|
94
|
+
X_train=Xmat[0:1460]
|
95
|
+
X_test=Xmat[1460:2920]
|
96
|
+
X_train.shape
|
97
|
+
X_test.shape
|
98
|
+
|
99
|
+
|
100
|
+
# 目的変数を調べる
|
101
|
+
ax = sns.distplot(y_train)
|
102
|
+
plt.show()
|
103
|
+
# logを使って目的変数の正規化
|
104
|
+
y_train = np.log(y_train)
|
105
|
+
ax = sns.distplot(y_train)
|
106
|
+
plt.show()
|
107
|
+
|
108
|
+
|
109
|
+
# 特徴量の重要性を調べる。
|
110
|
+
# ランダムフォレストで調べる。
|
111
|
+
from sklearn.ensemble import RandomForestRegressor
|
112
|
+
rf = RandomForestRegressor(n_estimators=80, max_features='auto')
|
113
|
+
rf.fit(X_train, y_train)
|
114
|
+
print('Training done using Random Forest')
|
115
|
+
|
116
|
+
ranking = np.argsort(-rf.feature_importances_)
|
117
|
+
f, ax = plt.subplots(figsize=(11, 9))
|
118
|
+
sns.barplot(x=rf.feature_importances_[ranking], y=X_train.columns.values[ranking], orient='h')
|
119
|
+
ax.set_xlabel("feature importance")
|
120
|
+
plt.tight_layout()
|
121
|
+
plt.show()
|
122
|
+
|
123
|
+
# 重要度の高い30の特徴量を使用。
|
124
|
+
X_train = X_train.iloc[:,ranking[:30]]
|
125
|
+
X_test = X_test.iloc[:,ranking[:30]]
|
126
|
+
|
127
|
+
# 重要度の高い特徴量2つを使って新しい特徴量を作る。
|
128
|
+
X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
|
129
|
+
X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
|
130
|
+
|
131
|
+
# 特徴量と目的変数の相関を調べる。
|
132
|
+
fig = plt.figure(figsize=(12,7))
|
133
|
+
for i in np.arange(30):
|
134
|
+
    ax = fig.add_subplot(5,6,i+1)
|
135
|
+
    sns.regplot(x=X_train.iloc[:,i], y=y_train)
|
136
|
+
|
137
|
+
plt.tight_layout()
|
138
|
+
plt.show()
|
139
|
+
|
140
|
+
# トレインデータの外れ値を処理。
|
141
|
+
X_train.shape
|
142
|
+
Xmat = X_train
|
143
|
+
Xmat.shape
|
144
|
+
Xmat['SalePrice'] = y_train
|
145
|
+
Xmat.shape
|
146
|
+
Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
|
147
|
+
Xmat.shape
|
148
|
+
Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
|
149
|
+
y_train = Xmat['SalePrice']
|
150
|
+
Xmat.shape
|
151
|
+
X_train = Xmat.drop(['SalePrice'], axis=1)
|
152
|
+
X_train.shape
|
153
|
+
Xmat.shape
|
2
間違ったタグの修正
title
CHANGED
File without changes
|
body
CHANGED
File without changes
|
1
情報の補足
title
CHANGED
File without changes
|
body
CHANGED
@@ -30,4 +30,8 @@
|
|
30
30
|
|
31
31
|
### 補足情報(FW/ツールのバージョンなど)
|
32
32
|
windows 10
|
33
|
-
anaconda,atom 最新アップデートを使用
|
33
|
+
anaconda,atom 最新アップデートを使用
|
34
|
+
|
35
|
+
7) 各Featureとターゲットの関係を調べる
|
36
|
+
における、自分の実行結果を載せます。
|
37
|
+

|