Random Forestで評価を行い抽出した重要度を縦棒グラフで表示したい。

私は現在までにRandom Forestで学習、評価を行い、重要度の抽出を行いました。
抽出した重要度を横棒グラフで表示することができたのですが縦棒グラフで表示することができません。

以下にコードとエラー文を載せておきます。

何かアドバイスをいただけると嬉しいです。
よろしくお願いします。

python
1from sklearn.ensemble import RandomForestRegressor 
2import numpy as np
3import pandas as pd
4import matplotlib.pyplot as plt
5import seaborn as sns
6from sklearn.model_selection import train_test_split
7from sklearn.metrics import r2_score
8
9# Load the data and fill missing values with the column means
10df = pd.read_csv("study.csv", encoding="utf-8")
11df = df.fillna(df.mean())
12
13# Drop the "data" column
14df = df.drop("data", axis=1)
15
16# y is the target variable, x holds the explanatory variables
17y = df.loc[:,"water_temperature"]
18x = df.loc[:,["airtemperature","precipitation","surface_pressure","u_component_of_wind_10m","water_level"]]
19
20# Split x and y into training and evaluation sets at a ratio of 8:2
21x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, train_size = 0.8, shuffle = True)
22
23# Fit the regressor and print the R^2 score on the evaluation set (a regression score, not a classification accuracy)
24rf = RandomForestRegressor()
25rf.fit(x_train, y_train)
26y_pred = rf.predict(x_test)
27print("正解率＝", r2_score(y_test, y_pred))
28
29# Tabulate the feature importances
30models = ["RF"]
31importances = pd.DataFrame({"features":x,  # NOTE(review): x is the feature DataFrame itself; x.columns was presumably intended - confirm
32                            models[0]:rf.feature_importances_})
33 
34# Sort the DataFrame by the "RF" column in descending order and display it
35importances.sort_values("RF",ascending=False)  # the sorted result is not assigned, so importances itself is left unsorted
36
37# Build a DataFrame of the random forest feature importances
38fea_rf_imp = pd.DataFrame({'imp': rf.feature_importances_, 'col': x.columns.values})
39fea_rf_imp = fea_rf_imp.sort_values(by='imp', ascending=False)[::-1]  # descending sort, then reversed: rows end up in ascending order of 'imp'
40
41# Visualize the random forest importances
42fig = plt.figure()
43plt.figure(figsize=(10, 7))  # opens a second figure; the one created on the previous line stays empty
44sns.barplot('imp','col',data=fea_rf_imp,orient='v',palette = "Blues")  # NOTE: with orient='v' the y column is aggregated numerically; passing the string column 'col' as y raises the TypeError shown in the traceback
45plt.style.use('ggplot')
46plt.title('Random Forest - Feature Importance',fontsize=28)
47plt.ylabel('Features',fontsize=18)
48plt.xlabel('Importance',fontsize=18)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\nanops.py in _ensure_numeric(x)
   1302         try:
-> 1303             x = float(x)
   1304         except ValueError:

ValueError: could not convert string to float: 'precipitation'

During handling of the above exception, another exception occurred:

ValueError                                Traceback (most recent call last)
~\anaconda3\lib\site-packages\pandas\core\nanops.py in _ensure_numeric(x)
   1306             try:
-> 1307                 x = complex(x)
   1308             except ValueError:

ValueError: complex() arg is a malformed string

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-23-2e3b4ad9498e> in <module>
      6 fig = plt.figure()
      7 plt.figure(figsize=(10, 7))
----> 8 sns.barplot('imp','col',data=fea_rf_imp,orient='v',palette = "Blues")
      9 plt.style.use('ggplot')
     10 plt.title('Random Forest - Feature Importance',fontsize=28)

~\anaconda3\lib\site-packages\seaborn\categorical.py in barplot(x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, seed, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge, ax, **kwargs)
   3148                           estimator, ci, n_boot, units, seed,
   3149                           orient, color, palette, saturation,
-> 3150                           errcolor, errwidth, capsize, dodge)
   3151 
   3152     if ax is None:

~\anaconda3\lib\site-packages\seaborn\categorical.py in __init__(self, x, y, hue, data, order, hue_order, estimator, ci, n_boot, units, seed, orient, color, palette, saturation, errcolor, errwidth, capsize, dodge)
   1615                                  order, hue_order, units)
   1616         self.establish_colors(color, palette, saturation)
-> 1617         self.estimate_statistic(estimator, ci, n_boot, seed)
   1618 
   1619         self.dodge = dodge

~\anaconda3\lib\site-packages\seaborn\categorical.py in estimate_statistic(self, estimator, ci, n_boot, seed)
   1497                     statistic.append(np.nan)
   1498                 else:
-> 1499                     statistic.append(estimator(stat_data))
   1500 
   1501                 # Get a confidence interval for this estimate

<__array_function__ internals> in mean(*args, **kwargs)

~\anaconda3\lib\site-packages\numpy\core\fromnumeric.py in mean(a, axis, dtype, out, keepdims)
   3330             pass
   3331         else:
-> 3332             return mean(axis=axis, dtype=dtype, out=out, **kwargs)
   3333 
   3334     return _methods._mean(a, axis=axis, dtype=dtype,

~\anaconda3\lib\site-packages\pandas\core\generic.py in stat_func(self, axis, skipna, level, numeric_only, **kwargs)
  11215             return self._agg_by_level(name, axis=axis, level=level, skipna=skipna)
  11216         return self._reduce(
> 11217             f, name, axis=axis, skipna=skipna, numeric_only=numeric_only
  11218         )
  11219 

~\anaconda3\lib\site-packages\pandas\core\series.py in _reduce(self, op, name, axis, skipna, numeric_only, filter_type, **kwds)
   3889                 )
   3890             with np.errstate(all="ignore"):
-> 3891                 return op(delegate, skipna=skipna, **kwds)
   3892 
   3893         # TODO(EA) dispatch to Index

~\anaconda3\lib\site-packages\pandas\core\nanops.py in _f(*args, **kwargs)
     67             try:
     68                 with np.errstate(invalid="ignore"):
---> 69                     return f(*args, **kwargs)
     70             except ValueError as e:
     71                 # we want to transform an object array

~\anaconda3\lib\site-packages\pandas\core\nanops.py in f(values, axis, skipna, **kwds)
    123                     result = alt(values, axis=axis, skipna=skipna, **kwds)
    124             else:
--> 125                 result = alt(values, axis=axis, skipna=skipna, **kwds)
    126 
    127             return result

~\anaconda3\lib\site-packages\pandas\core\nanops.py in nanmean(values, axis, skipna, mask)
    540         dtype_count = dtype
    541     count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
--> 542     the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
    543 
    544     if axis is not None and getattr(the_sum, "ndim", False):

~\anaconda3\lib\site-packages\pandas\core\nanops.py in _ensure_numeric(x)
   1308             except ValueError:
   1309                 # e.g. "foo"
-> 1310                 raise TypeError(f"Could not convert {x} to numeric")
   1311     return x
   1312 

TypeError: Could not convert precipitation to numeric

yureighost

2020/09/18 03:46

データの詳細がわからないので何とも言えないのですが、 sns.barplotの'imp'と'col'の引数の位置を逆にして渡すとどうなるでしょうか。

aokikenichi

2020/09/18 03:47

エラーコードが読みにくいので、Pythonコードと同じくコード表示にしていただけると助かります。 precipitation to numeric だけ見ますと値の変換の問題なのでグラフの縦横は関係ないかなと思われますが、横の棒グラフのコードは示していただけますでしょうか。どこをどう変えたのでしょうか。

buffalo

2020/09/18 04:13

@yureighost 引数の位置を逆にしたところ上記と同じエラーが出ました。

buffalo

2020/09/18 04:16

@aokikenichi 横の棒グラフの場合はsns.barplotのorient='v'をorient='h'にするとエラーなくグラフが作成されます。

行動規範の内容に同意します

回答1件

ベストアンサー

colとimpを入れ替えたところ、縦棒グラフで表示できました。
orient='v'を指定したときに図の縦横だけが変わってくれればよいのですが、実際にはx側がカテゴリ軸、y側が数値軸として扱われるため、
yに渡した文字列型のcolを数値として集計（平均）しようとしてエラーになっているようです。
ですので、barplotに指定するcolとimpの順番を入れ替えれば解決します。

Python
# Vertical bar chart of the random forest feature importances.
# Expects fea_rf_imp to have a string column 'col' (feature names) and a
# numeric column 'imp' (importances), as built in the question's script.

# Select the style first: style settings only affect artists that are
# created afterwards, so calling this after plotting is too late.
plt.style.use('ggplot')

# Create a single figure of the desired size; a bare plt.figure() followed by
# a second plt.figure(figsize=...) would leave an extra empty figure behind.
fig = plt.figure(figsize=(10, 7))

# For vertical bars the categorical column goes on x and the numeric column
# on y. x and y are passed by keyword because newer seaborn releases no
# longer accept them positionally.
sns.barplot(x='col', y='imp', data=fea_rf_imp, orient='v', palette="Blues")

plt.title('Random Forest - Feature Importance', fontsize=28)
# Axis labels follow the data: feature names on x, importances on y.
plt.xlabel('Features', fontsize=18)
plt.ylabel('Importance', fontsize=18)