matplotlibのplotを行う際の行列整理について

いま、以下のコードによりリッジ回帰を行っています。
この場合、コードの終盤にあるplotにより問題なく散布図を出すことができます。

Python
# Ridge-regression listing: for each of seven target columns, fit Ridge on polynomial + standardized features and draw a residual plot.
1%matplotlib inline
2import matplotlib.pyplot as plt
3import numpy as np
4import pandas as pd
5from sklearn.linear_model import LinearRegression
6from sklearn.preprocessing import StandardScaler
7from sklearn.model_selection import train_test_split
8from sklearn.metrics import mean_squared_error
9from sklearn.preprocessing import PolynomialFeatures
10from sklearn.linear_model import Ridge
11
12data=pd.read_excel('元データ.xlsx')
13data=data.drop([0,1]).reset_index(drop=True)  # drop the rows labelled 0 and 1, then renumber the index from 0
14data['date']=pd.to_datetime(data['date'],format='%Y年%m月')  # parse strings written as '<year>年<month>月'
15data['POSIX']=data['date'].astype('int64').values//10**9  # integer timestamps floor-divided by 10**9 (presumably ns -> s; confirm)
16data['year']=data['date'].dt.year
17data['month']=data['date'].dt.month
18
19x=data.iloc[0:38:1,[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,32]]  # features: rows 0-37, columns picked by position
20y0=data.iloc[:38,[23]]  # a list selector such as [23] keeps each target as a one-column DataFrame (2-D)
21y1=data.iloc[:38,[24]]
22y2=data.iloc[:38,[25]]
23y3=data.iloc[:38,[26]]
24y4=data.iloc[:38,[27]]
25y5=data.iloc[:38,[28]]
26y6=data.iloc[:38,[29]]
27
28for y in [y0, y1, y2, y3, y4, y5, y6]:  # one model and one figure per target column
29    x_pos=data['POSIX'].values
30    N=len(x)
31    N_train=round(len(x)*0.8)  # the first 80% of the rows, in file order, form the training split (no shuffling)
32    N_test=N-N_train
33
34    x_pos_train,x_pos_test=x_pos[:N_train],x_pos[N_train:]  # x_pos_* (and N_test) are not used again in this listing
35    x_train,y_train=x[:N_train],y[:N_train]
36    x_test,y_test=x[N_train:],y[N_train:]
37    
38    POLY=PolynomialFeatures(degree=6,include_bias=False)  # polynomial terms up to degree 6, without the bias column
39
40    x_train_pol=POLY.fit_transform(x_train)  # fitted on the training split only
41    x_test_pol=POLY.transform(x_test)  # test split reuses the training fit
42
43    sc=StandardScaler()
44    x_train_std=sc.fit_transform(x_train_pol)  # scaling statistics come from the training split only
45    x_test_std=sc.transform(x_test_pol)
46
47    model2=Ridge(alpha=0.8)
48    model2.fit(x_train_std,y_train)  # y_train is passed as a one-column DataFrame
49    y_train_pred=model2.predict(x_train_std)
50    y_test_pred=model2.predict(x_test_std)
51    
52    print('平均二乗誤差（訓練データ）：',mean_squared_error(y_train,y_train_pred))
53    print('平均二乗誤差（テストデータ）：',mean_squared_error(y_test,y_test_pred))
54
55    #Residual plot
56    plt.figure(figsize=(8,4))
57    plt.scatter(y_train_pred, y_train_pred-y_train,c='red',marker='o',edgecolor='white',label='Training data')  # x: prediction, y: prediction minus target
58    plt.scatter(y_test_pred, y_test_pred-y_test,c='blue',marker='s',edgecolor='white',label='Test data')
59    plt.xlabel('Predicted values')
60    plt.ylabel('Residuals')
61    plt.legend(loc='upper left') 
62    plt.hlines(y=0,xmin=0,xmax=100000000000,color='black',lw=0.5)  # zero-residual reference line
63    plt.xlim([10000000,100000000])
64    plt.tight_layout()
65    plt.show()

一方、以下のラッソ回帰（Lasso回帰）を行うと描画の際にエラーとなります。
エラーメッセージも含めて以下に記述します。モデルを変えただけで、あとはほぼすべてコードは同じです。もちろん元データも同じものです。

Python
# Lasso-regression listing: same pipeline as the Ridge listing with the model swapped; the first plt.scatter call is the line the traceback below points at.
1%matplotlib inline
2import matplotlib.pyplot as plt
3import numpy as np
4import pandas as pd
5from sklearn.linear_model import LinearRegression
6from sklearn.preprocessing import StandardScaler
7from sklearn.model_selection import train_test_split
8from sklearn.metrics import mean_squared_error
9from sklearn.preprocessing import PolynomialFeatures
10from sklearn.linear_model import Ridge
11from sklearn.linear_model import Lasso
12
13
14data=pd.read_excel('元データ.xlsx')
15data=data.drop([0,1]).reset_index(drop=True)  # drop the rows labelled 0 and 1, then renumber the index from 0
16data['date']=pd.to_datetime(data['date'],format='%Y年%m月')  # parse strings written as '<year>年<month>月'
17data['POSIX']=data['date'].astype('int64').values//10**9  # integer timestamps floor-divided by 10**9 (presumably ns -> s; confirm)
18data['year']=data['date'].dt.year
19data['month']=data['date'].dt.month
20
21x=data.iloc[0:38:1,[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,32]]  # features: rows 0-37, columns picked by position
22y0=data.iloc[:38,[23]]  # a list selector such as [23] keeps each target as a one-column DataFrame (2-D)
23y1=data.iloc[:38,[24]]
24y2=data.iloc[:38,[25]]
25y3=data.iloc[:38,[26]]
26y4=data.iloc[:38,[27]]
27y5=data.iloc[:38,[28]]
28y6=data.iloc[:38,[29]]
29
30for y in [y0, y1, y2, y3, y4, y5, y6]:  # one model and one figure per target column
31
32    x_pos=data['POSIX'].values
33    N=len(x)
34    N_train=round(len(x)*0.8)  # the first 80% of the rows, in file order, form the training split (no shuffling)
35    N_test=N-N_train
36
37    x_pos_train,x_pos_test=x_pos[:N_train],x_pos[N_train:]  # x_pos_* (and N_test) are not used again in this listing
38    x_train,y_train=x[:N_train],y[:N_train]
39    x_test,y_test=x[N_train:],y[N_train:]
40
41    POLY=PolynomialFeatures(degree=6,include_bias=False)  # polynomial terms up to degree 6, without the bias column
42    x_train_pol=POLY.fit_transform(x_train)  # fitted on the training split only
43    x_test_pol=POLY.transform(x_test)  # test split reuses the training fit
44
45    sc=StandardScaler()
46    x_train_std=sc.fit_transform(x_train_pol)  # scaling statistics come from the training split only
47    x_test_std=sc.transform(x_test_pol)
48    
49    model3=Lasso(alpha=0.1)
50    model3.fit(x_train_std,y_train)  # y_train is passed as a one-column DataFrame
51    
52    y_train_pred=model3.predict(x_train_std)  # per the question text, this prediction comes back 1-D here (2-D in the Ridge listing)
53    y_test_pred=model3.predict(x_test_std)
54
55    print('平均二乗誤差（訓練データ）：',mean_squared_error(y_train,y_train_pred))
56    print('平均二乗誤差（テストデータ）：',mean_squared_error(y_test,y_test_pred))
57    
58    #y_train_pred=np.array(y_train_pred).reshape(1,-1).tolist()
59    #y_test_pred=np.array(y_test_pred).reshape(1,-1).tolist()
60    plt.figure(figsize=(8,4))
61    plt.scatter(y_train_pred,y_train_pred-y_train,c='red',marker='o',edgecolor='white',label='Training data')  # the subtraction mixes the prediction array with the DataFrame y_train; the ValueError is raised inside pandas here
62    plt.scatter(y_test_pred, y_test_pred-y_test,c='blue',marker='s',edgecolor='white',label='Test data')
63    plt.xlabel('Predicted values')
64    plt.ylabel('Residuals')
65    plt.legend(loc='upper left')
66    plt.hlines(y=0,xmin=0,xmax=100000000,color='black',lw=0.5)  # zero-residual reference line
67    plt.xlim([10000000,100000000])
68    plt.tight_layout()
69    plt.show()
70
71平均二乗誤差（訓練データ）： 9334.400142192504
72平均二乗誤差（テストデータ）： 7933540129871.662
73---------------------------------------------------------------------------
74ValueError                                Traceback (most recent call last)
75<ipython-input-525-38e77eb1362b> in <module>
76     68     #y_test_pred=np.array(y_test_pred).reshape(1,-1).tolist()
77     69     plt.figure(figsize=(8,4))
78---> 70     plt.scatter(y_train_pred,y_train_pred-y_train,c='red',marker='o',edgecolor='white',label='Training data')
79     71     plt.scatter(y_test_pred, y_test_pred-y_test,c='blue',marker='s',edgecolor='white',label='Test data')
80     72     plt.xlabel('Predicted values')
81
82~\Anaconda3\lib\site-packages\pandas\core\ops\__init__.py in f(self, other, axis, level, fill_value)
83   1486     def f(self, other, axis=default_axis, level=None, fill_value=None):
84   1487 
85-> 1488         other = _align_method_FRAME(self, other, axis)
86   1489 
87   1490         if isinstance(other, ABCDataFrame):
88
89~\Anaconda3\lib\site-packages\pandas\core\ops\__init__.py in _align_method_FRAME(left, right, axis)
90   1425 
91   1426         if right.ndim == 1:
92-> 1427             right = to_series(right)
93   1428 
94   1429         elif right.ndim == 2:
95
96~\Anaconda3\lib\site-packages\pandas\core\ops\__init__.py in to_series(right)
97   1417             if len(left.columns) != len(right):
98   1418                 raise ValueError(
99-> 1419                     msg.format(req_len=len(left.columns), given_len=len(right))
100   1420                 )
101   1421             right = left._constructor_sliced(right, index=left.columns)
102
103ValueError: Unable to coerce to Series, length must be 1: given 30
104
105<Figure size 576x288 with 0 Axes>

ここで、私としては、y_train_predとy_test_predのshapeが、リッジ回帰のときとなぜか異なる形となっていることに気付きました。
リッジ回帰の場合は[[a,b,c,...]]と2次元になっており、ラッソ回帰の場合は[a,b,c,...]と1次元になっていました。

このため、結論としては上記コードでコメントアウトしている2行（#で示しています）を入れて、これらを2次元配列にすると直ることが分かりました。
ここで詳しい方に質問なのですが、

１．なぜラッソ回帰をやると次元が1次元になってしまうのでしょうか？
２．なぜ2次元配列にしないと散布図が書けないのでしょうか？

こちらどなたかお分かりになられる方、ご教示いただけないでしょうか。
丸一日かけても理解ができず・・・。助けてくださいませ！

行動規範の内容に同意します

回答1件

ベストアンサー

まず最初にshapeを都度確認することを推奨します。何か変なことが起こっています。

先に簡単な方の2からお答えします。

２．なぜ2次元配列にしないと散布図が書けないのでしょうか？

そんなことはありません。テストコードで確認したところ、１次元配列であっても散布図は描けます。

python
# Demonstration that plt.scatter accepts every combination of 1-D and 2-D (column-vector) inputs.
1import matplotlib.pyplot as plt
2import numpy as np
3
4x = np.array([1, 2, 3])  # 1-D, shape (3,)
5y = np.array([4, 5, 6])  # 1-D, shape (3,)
6xx = np.array([7, 8, 9]).reshape(-1, 1)  # column vector, shape (3, 1)
7yy = np.array([10, 11, 12]).reshape(-1, 1)  # column vector, shape (3, 1)
8
9print(x.shape, y.shape)
10print(xx.shape, yy.shape)
11
12plt.scatter(x, y, label="X:1D Y:1D")  # the four calls cover every 1-D / 2-D pairing of x and y
13plt.scatter(xx, y, label="X:2D Y:1D")
14plt.scatter(x, yy, label="X:1D Y:2D")
15plt.scatter(xx, yy, label="X:2D Y:2D")
16
17plt.legend()
18plt.show()

実行結果

terminal
1(3,) (3,)
2(3, 1) (3, 1)

１．なぜラッソ回帰をやると次元が1次元になってしまうのでしょうか？

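これも次元は維持されます。LassoとRidgeで次元が異なるとしたら、(断言はできませんが)何かおかしなことが起こっているように思います。少なくとも以下のテストコードでは、両者の予測結果の次元は一致しました。なお、このテストコードでは y が1次元配列（shape が (100,)）であるのに対し、質問のコードの y は data.iloc[:38,[23]] のように列をリストで指定しているため1列の DataFrame（2次元）になっており、この点が条件として異なります。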

python
1import numpy as np
2import matplotlib.pyplot as plt
3from sklearn.linear_model import Lasso, Ridge
4from sklearn.model_selection import train_test_split
5
6
7def func(x):  # noisy straight line: returns a * x + b plus random noise, element-wise
8    a = 10.0  # slope
9    b = 5.0  # intercept
10    y = a * x + b
11    noise = np.random.randn(len(y))  # one random draw per element of y, so the result differs on every call
12    return y + noise
13
14
15def main():
16    x = np.linspace(0, 10, 100)  # x.shape == (100,)
17    y = func(x)  # y.shape == (100,)
18
19    # Fittingのために (n_sample, 1) にする
20    x = x.reshape(-1, 1)  # x.shape == (100, 1)
21
22    # 訓練データと試験データの分離
23    x_train, x_test, y_train, y_test = train_test_split(
24        x, y, test_size=0.33, random_state=42
25    )
26    # x_train.shape == (67, 1)
27    # y_train.shape == (67,)
28    # x_test.shape == (33, 1)
29    # y_test.shape == (33,)
30
31    # Lasso
32    # (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
33    clf_lasso = Lasso(alpha=0.1)
34    clf_lasso.fit(x_train, y_train)
35    y_test_lasso = clf_lasso.predict(x_test)  # y_test_lasso.shape == (33,)
36
37    # Ridge
38    # ||y - Xw||^2_2 + alpha * ||w||^2_2
39    clf_ridge = Ridge(alpha=0.1)
40    clf_ridge.fit(x, y)
41    y_test_ridge = clf_ridge.predict(x_test)  # y_test_ridge == (33,)
42
43    # Plot
44    plt.scatter(x_test, y_test, marker=".", label="Test data", color="red")
45    plt.scatter(x_test, y_test_lasso, marker=".", label="Lasso", color="blue")
46    plt.scatter(x_test, y_test_ridge, marker=".", label="Ridge", color="green")
47    plt.legend()
48    plt.show()
49
50
51if __name__ == "__main__":  # run the demo only when this file is executed as a script
52    main()