編集履歴

質問編集履歴

修正

2018/08/10 06:41

投稿

nishibayashi

スコア9

title CHANGED Viewed

File without changes

body CHANGED Viewed

@@ -36,7 +36,7 @@
 における、自分の実行結果を載せます。
 ![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
-### 全コード
+```python
 import numpy as np
 import pandas as pd
@@ -117,4 +117,5 @@
 Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
 Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
 y_train = Xmat['SalePrice']
-X_train = Xmat.drop(['SalePrice'], axis=1)
+X_train = Xmat.drop(['SalePrice'], axis=1)
+```

情報の修正

2018/08/10 06:41

投稿

nishibayashi

スコア9

title CHANGED Viewed

File without changes

body CHANGED Viewed

@@ -37,24 +37,19 @@
 ![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
 ### 全コード
-import numpy as np # linear algebra
+import numpy as np
-import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+import pandas as pd
-# for visualization
 import matplotlib.pyplot as plt
 import seaborn as sns
-# データをインポート
 train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
 test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
-# データを調べる。
 print(train.dtypes)
 train.shape
 test.shape
-# ラベル化
 from sklearn.preprocessing import LabelEncoder
 for i in range(train.shape[1]):
@@ -64,50 +59,33 @@
         train.iloc[:,i] = lbl.transform(list(train.iloc[:,i].values))
         test.iloc[:,i] = lbl.transform(list(test.iloc[:,i].values))
-# 欠損の可視化
 import missingno as msno
 train.head(5)
 msno.matrix(df=train, figsize=(20,14), color=(0.5,0,0))
-# keep ID for submission
 train_ID = train['Id']
 test_ID = test['Id']
-# split data for training
 y_train = train['SalePrice']
 X_train = train.drop(['Id','SalePrice'], axis=1)
 X_test = test.drop('Id', axis=1)
-# dealing with missing data
 Xmat = pd.concat([X_train, X_test])
 Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
 Xmat = Xmat.fillna(Xmat.median())
-#　新しい特徴量を作る。
 Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
-# Xmatを分割して代入（分割間違えないように）
-Xmat.shape
-X_train.shape
-X_test.shape
 X_train=Xmat[0:1460]
 X_test=Xmat[1460:2920]
-X_train.shape
-X_test.shape
-# 目的変数を調べる
 ax = sns.distplot(y_train)
 plt.show()
-# logを使って目的変数の正規化
 y_train = np.log(y_train)
 ax = sns.distplot(y_train)
 plt.show()
-# 特徴量の重要性を調べる。
-# ランダムフォレストで調べる。
 from sklearn.ensemble import RandomForestRegressor
 rf = RandomForestRegressor(n_estimators=80, max_features='auto')
 rf.fit(X_train, y_train)
@@ -120,15 +98,12 @@
 plt.tight_layout()
 plt.show()
-# 重要度の高い30の特徴量を使用。
 X_train = X_train.iloc[:,ranking[:30]]
 X_test = X_test.iloc[:,ranking[:30]]
-# 重要度の高い特徴量2つを使って新しい特徴量を作る。
 X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
 X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
-# 特徴量と目的変数の相関を調べる。
 fig = plt.figure(figsize=(12,7))
 for i in np.arange(30):
     ax = fig.add_subplot(5,6,i+1)
@@ -137,17 +112,9 @@
 plt.tight_layout()
 plt.show()
-# トレインデータの外れ値を処理。
-X_train.shape
 Xmat = X_train
-Xmat.shape
 Xmat['SalePrice'] = y_train
-Xmat.shape
 Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
-Xmat.shape
 Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
 y_train = Xmat['SalePrice']
-Xmat.shape
-X_train = Xmat.drop(['SalePrice'], axis=1)
+X_train = Xmat.drop(['SalePrice'], axis=1)
-X_train.shape
-Xmat.shape

情報の追加

2018/08/09 07:06

投稿

nishibayashi

スコア9

title CHANGED Viewed

File without changes

body CHANGED Viewed

@@ -34,4 +34,120 @@
 7) 各Featureとターゲットの関係を調べる
 における、自分の実行結果を載せます。
-![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
+![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)
+### 全コード
+import numpy as np # linear algebra
+import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
+# for visualization
+import matplotlib.pyplot as plt
+import seaborn as sns
+# データをインポート
+train = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\train.csv')
+test = pd.read_csv(r'C:\Users\ringo\Documents\Python\House Prices\test.csv')
+# データを調べる。
+print(train.dtypes)
+train.shape
+test.shape
+# ラベル化
+from sklearn.preprocessing import LabelEncoder
+for i in range(train.shape[1]):
+    if train.iloc[:,i].dtypes == object:
+        lbl = LabelEncoder()
+        lbl.fit(list(train.iloc[:,i].values) + list(test.iloc[:,i].values))
+        train.iloc[:,i] = lbl.transform(list(train.iloc[:,i].values))
+        test.iloc[:,i] = lbl.transform(list(test.iloc[:,i].values))
+# 欠損の可視化
+import missingno as msno
+train.head(5)
+msno.matrix(df=train, figsize=(20,14), color=(0.5,0,0))
+# keep ID for submission
+train_ID = train['Id']
+test_ID = test['Id']
+# split data for training
+y_train = train['SalePrice']
+X_train = train.drop(['Id','SalePrice'], axis=1)
+X_test = test.drop('Id', axis=1)
+# dealing with missing data
+Xmat = pd.concat([X_train, X_test])
+Xmat = Xmat.drop(['LotFrontage','MasVnrArea','GarageYrBlt'], axis=1)
+Xmat = Xmat.fillna(Xmat.median())
+#　新しい特徴量を作る。
+Xmat['TotalSF'] = Xmat['TotalBsmtSF'] + Xmat['1stFlrSF'] + Xmat['2ndFlrSF']
+# Xmatを分割して代入（分割間違えないように）
+Xmat.shape
+X_train.shape
+X_test.shape
+X_train=Xmat[0:1460]
+X_test=Xmat[1460:2920]
+X_train.shape
+X_test.shape
+# 目的変数を調べる
+ax = sns.distplot(y_train)
+plt.show()
+# logを使って目的変数の正規化
+y_train = np.log(y_train)
+ax = sns.distplot(y_train)
+plt.show()
+# 特徴量の重要性を調べる。
+# ランダムフォレストで調べる。
+from sklearn.ensemble import RandomForestRegressor
+rf = RandomForestRegressor(n_estimators=80, max_features='auto')
+rf.fit(X_train, y_train)
+print('Training done using Random Forest')
+ranking = np.argsort(-rf.feature_importances_)
+f, ax = plt.subplots(figsize=(11, 9))
+sns.barplot(x=rf.feature_importances_[ranking], y=X_train.columns.values[ranking], orient='h')
+ax.set_xlabel("feature importance")
+plt.tight_layout()
+plt.show()
+# 重要度の高い30の特徴量を使用。
+X_train = X_train.iloc[:,ranking[:30]]
+X_test = X_test.iloc[:,ranking[:30]]
+# 重要度の高い特徴量2つを使って新しい特徴量を作る。
+X_train["Interaction"] = X_train["TotalSF"]*X_train["OverallQual"]
+X_test["Interaction"] = X_test["TotalSF"]*X_test["OverallQual"]
+# 特徴量と目的変数の相関を調べる。
+fig = plt.figure(figsize=(12,7))
+for i in np.arange(30):
+    ax = fig.add_subplot(5,6,i+1)
+    sns.regplot(x=X_train.iloc[:,i], y=y_train)
+plt.tight_layout()
+plt.show()
+# トレインデータの外れ値を処理。
+X_train.shape
+Xmat = X_train
+Xmat.shape
+Xmat['SalePrice'] = y_train
+Xmat.shape
+Xmat = Xmat.drop(Xmat[(Xmat['TotalSF']>5) & (Xmat['SalePrice']<12.5)].index)
+Xmat.shape
+Xmat = Xmat.drop(Xmat[(Xmat['GrLivArea']>5) & (Xmat['SalePrice']<13)].index)
+y_train = Xmat['SalePrice']
+Xmat.shape
+X_train = Xmat.drop(['SalePrice'], axis=1)
+X_train.shape
+Xmat.shape

間違ったタグの修正

2018/08/09 07:01

投稿

nishibayashi

スコア9

title CHANGED Viewed

File without changes

body CHANGED Viewed

File without changes

情報の補足

2018/08/09 05:48

投稿

nishibayashi

スコア9

title CHANGED Viewed

File without changes

body CHANGED Viewed

@@ -30,4 +30,8 @@
 ### 補足情報（FW/ツールのバージョンなど）
 windows 10
-anaconda,atom 最新アップデートを使用
+anaconda,atom 最新アップデートを使用
+7) 各Featureとターゲットの関係を調べる
+における、自分の実行結果を載せます。
+![イメージ説明](863580f232c59fc79a8096f2bb1209c8.png)