重回帰分析のマハラノビス距離でエラー

Pythonのjupyter notebookで、UCI machine learning repositoryのあるデータ（'flare.data2.txt'）を重回帰分析しております。自作関数によりStepAICで変数選択を行ったのち、信頼区間・予測区間を求めようとすると、エラーが出ます。同じ手順で、"abalone.data.txt"というデータを分析した際には全く問題が無いのですが、なぜかわからずにおります。
エラーが出ているのは、マハラノビスの距離を求めるところで、分散共分散行列の逆行列がNANになってしまっている（下記の"mahara is..."以下の出力）ためなのですが、なぜそのようになるのかが分かりません。
引数のベクトルの値は、ちゃんと入っているようですし、"abalone.data.txt"分析時のデバッグでも、ほぼ同様にベクトルの値が出力され、行列式の値もほぼ近いのですが、不思議なことに、その場合はちゃんと分散共分散行列の逆行列が求まっています。。。変数vecの値の入り方が違うのでしょうか？

【環境】Window10 64bit, chrome

Python
1
2def step_aic(model, exog, endog, **kwargs):
3    """
4    This select the best exogenous variables with AIC
5    Both exog and endog values can be either str or list.
6    (Endog list is for the Binomial family.)
7
8    Note: This adopt only "forward" selection
9
10    Args:
11        model: model from statsmodels.formula.api
12        exog (str or list): exogenous variables
13        endog (str or list): endogenous variables
14        kwargs: extra keyword argments for model (e.g., data, family)
15
16    Returns:
17        model: a model that seems to have the smallest AIC
18    """
19
20    # exog, endogは強制的にリスト形式に変換しておく
21    exog = np.r_[[exog]].flatten()
22    endog = np.r_[[endog]].flatten()
23    remaining = set(exog)
24    selected = []  # 採用が確定された要因
25
26    # 定数項のみのAICを計算
27    formula_head = ' + '.join(endog) + ' ~ '
28    formula = formula_head + '1'
29    aic = model(formula=formula, **kwargs).fit().aic
30    print('AIC: {}, formula: {}'.format(round(aic, 3), formula))
31
32    current_score, best_new_score = np.ones(2) * aic
33
34    # 全要因を採択するか，どの要因を追加してもAICが上がらなければ終了
35    while remaining and current_score == best_new_score:
36        scores_with_candidates = []
37        for candidate in remaining:
38
39            # 残っている要因を1つずつ追加したときのAICを計算
40            formula_tail = ' + '.join(selected + [candidate])
41            formula = formula_head + formula_tail
42            aic = model(formula=formula, **kwargs).fit().aic
43            print('AIC: {}, formula: {}'.format(round(aic, 3), formula))
44
45            scores_with_candidates.append((aic, candidate))
46
47        # 最もAICが小さかった要因をbest_candidateとする
48        scores_with_candidates.sort()
49        scores_with_candidates.reverse()
50        best_new_score, best_candidate = scores_with_candidates.pop()
51
52        # 候補要因追加でAICが下がったならば，それを確定要因として追加する
53        if best_new_score < current_score:
54            remaining.remove(best_candidate)
55            selected.append(best_candidate)
56            current_score = best_new_score
57
58    formula = formula_head + ' + '.join(selected)
59    print('The best formula: {}'.format(formula))
60    return model(formula, **kwargs).fit()
61
62
63import pandas as pd
64import numpy as np
65import scipy as sp
66import scipy.stats as stats
67import statsmodels.api as sm
68import statsmodels.formula.api as smf
69
70d = pd.read_csv('flare.data2.txt',header=None,skiprows=1,delim_whitespace=True,names=('x1','x2','x3','x4','x5','x6','x7','x8','x9','x10','y1','y2','y3'))
71d.head()
72df2 = pd.get_dummies(d)
73model = step_aic(smf.ols,['x4','x5','x6','x7','x8','x9','x10','x1_B','x1_C','x1_D','x1_E','x1_F','x1_H','x2_A','x2_H','x2_K','x2_R','x2_S','x2_X','x3_C','x3_I','x3_O','x3_X'],['y1'],data=df2)
74
75
76
77import pandas as pd
78import numpy as np
79import scipy as sp
80import scipy.stats as stats
81
82y=df2['y1']
83X=df2[['x1_E', 'x3_I', 'x2_K', 'x4', 'x1_F', 'x1_D', 'x9', 'x6', 'x1_C']]
84
85import statsmodels.api as sm
86mod = sm.OLS(y, sm.add_constant(X))
87res = mod.fit()
88print(res.summary())
89
90
91Sigma = sp.asmatrix(df2[['x1_E', 'x3_I', 'x2_K', 'x4', 'x1_F', 'x1_D', 'x9', 'x6', 'x1_C']].cov().as_matrix())
92Sigma
93pp = sp.linalg.det(Sigma)
94print("det is",pp)
95
96
97def Mahala2(vec_x, vec_mean, mat):
98    length = mat.shape[0]
99    vec = sp.asmatrix((vec_x - vec_mean).values.reshape(length, 1))
100    print("vec_x is",vec_x)
101    print("vec_mean is",vec_mean)
102    print("vec is", vec)
103    inv = sp.linalg.inv(mat)
104    pp = sp.linalg.det(mat)
105    print("det is",pp)
106    mahala2 = vec.T.dot(inv.dot(vec))
107    print("mahara is",mahala2)
108    return mahala2[0, 0]
109
110
111a1E = df2[['x1_E']].mean() * 1.01
112a3I = df2[['x3_I']].mean() * 1.01
113
114a2K = df2[['x2_K']].mean() * 1.01
115a4 = df2[['x4']].mean() * 1.01
116
117a1F = df2[['x1_F']].mean() * 1.01
118a1D = df2[['x1_D']].mean() * 1.01
119
120a9 = df2[['x9']].mean() * 1.01
121a6 = df2[['x6']].mean() * 1.01
122
123a1C = df2[['x1_C']].mean() * 1.01
124
125import matplotlib.pyplot as plt
126%matplotlib inline
127
128import scipy as sp
129
130hl_l = sp.linspace(0, 4, 100)
131hat_y = []
132# x6以外変数固定
133(x1_E, x3_I, x2_K, x4, x1_F, x1_D, x9, x1_C) = (a1E,a3I,a2K,a4,a1F,a1D,a9,a1C)
134
135print((x1_E, x3_I, x2_K, x4, x1_F, x1_D, x9, x1_C))
136for hl in hl_l:
137    X = sp.array([1,x1_E, x3_I, x2_K, x4, x1_F, x1_D, x9, hl, x1_C])
138    hat_y.append(X.dot(res.params))
139plt.plot(hl_l, hat_y)
140n = 1066
141phi_e = n - res.df_model - 1
142t_0025 = stats.t.isf(q=0.05/2, df=phi_e)
143# マハラノビス距離の二乗
144#vec_mean = myrmesia_remNan_int[["HL", "ML", "WL", "PrW", "PpL", "GW"]].mean() # 平均ベクトル
145vec_mean = df2[['x1_E', 'x3_I', 'x2_K', 'x4', 'x1_F', 'x1_D', 'x9', 'x6', 'x1_C']].mean()
146print(vec_mean)
147D2 = []
148for hl in hl_l:
149    D2_0 = Mahala2([x1_E, x3_I, x2_K, x4, x1_F, x1_D, x9, hl, x1_C], vec_mean, Sigma)
150    D2.append(D2_0)
151    print(D2_0)
152D2 = sp.array(D2)
153
154
155interval095 = t_0025 * sp.sqrt((1/n + D2 / (n-1)) * res.scale)
156print(interval095)
157plt.plot(hl_l, hat_y - interval095, "--g", label="95% interval conf.")
158plt.plot(hl_l, hat_y + interval095, "--g")
159
160pred_interval095 = t_0025 * sp.sqrt((1 + 1/n + D2 / (n-1)) * res.scale)
161plt.plot(hl_l, hat_y - pred_interval095, "--r", label="95% prediction conf.")
162plt.plot(hl_l, hat_y + pred_interval095, "--r")
163
164
165# sample x6=2 のとき
166hl=2
167#X = sp.array([1, hl, ml, wl, prw, ppl, gw])
168X = sp.array([1, x1_E, x3_I, x2_K, x4, x1_F, x1_D, x9, hl, x1_C])
169plt.scatter(hl, X.dot(res.params), color="k", )
170
171plt.grid()

【出力】

(x1_E 0.090009
dtype: float64, x3_I 0.211285
dtype: float64, x2_K 0.043583
dtype: float64, x4 1.165385
dtype: float64, x1_F 0.040741
dtype: float64, x1_D 0.226445
dtype: float64, x9 1.035582
dtype: float64, x1_C 0.199916
dtype: float64)
x1_E 0.089118
x3_I 0.209193
x2_K 0.043152
x4 1.153846
x1_F 0.040338
x1_D 0.224203
x9 1.025328
x6 1.059099
x1_C 0.197936
dtype: float64
vec_x is [x1_E 0.090009
dtype: float64, x3_I 0.211285
dtype: float64, x2_K 0.043583
dtype: float64, x4 1.165385
dtype: float64, x1_F 0.040741
dtype: float64, x1_D 0.226445
dtype: float64, x9 1.035582
dtype: float64, 0.0, x1_C 0.199916
dtype: float64]
vec_mean is x1_E 0.089118
x3_I 0.209193
x2_K 0.043152
x4 1.153846
x1_F 0.040338
x1_D 0.224203
x9 1.025328
x6 1.059099
x1_C 0.197936
dtype: float64
vec is [[x1_E 0.000891
dtype: float64]
[x3_I 0.002092
dtype: float64]
[x2_K 0.000432
dtype: float64]
[x4 0.011538
dtype: float64]
[x1_F 0.000403
dtype: float64]
[x1_D 0.002242
dtype: float64]
[x9 0.010253
dtype: float64]
[-1.0590994371482176]
[x1_C 0.001979
dtype: float64]]
det is 3.3851611336655366e-11
mahara is [[x1_C NaN
x1_D NaN
x1_E NaN
x1_F NaN
x2_K NaN
x3_I NaN
x4 NaN
x9 NaN
dtype: float64]]

【デバッグ】

python
1コード
2``
3
4ちなみに、出力の差分としましては、うまくいっているデータの時は、
5関数Mahala2の中でデバッグ表示しているベクトルについて、
6
7vec is [[-0.40788125]
8 [ 0.00139516]
9 [ 0.00359367]
10 [ 0.00180594]
11 [ 0.00321283]]
12
13となっているのですが、NGの時は以下の様になっております。
14
15vec is [[x1_E    0.000891
16dtype: float64]
17 [x3_I    0.002092
18dtype: float64]
19 [x2_K    0.000432
20dtype: float64]
21 [x4    0.011538
22dtype: float64]
23 [x1_F    0.000403
24dtype: float64]
25 [x1_D    0.002242
26dtype: float64]
27 [x9    0.010253
28dtype: float64]
29 [-1.0186953967441772]
30 [x1_C    0.001979
31dtype: float64]]`

y_waiwai

2018/11/05 06:32

このままではコードが読めません。質問を編集して、<code>ボタンを押して、’’’の枠の中にコードを貼り付けてください

_hh

2018/11/05 08:11

ご連絡ありがとうございます。追加情報も加え修正させて頂きました。さっぱり原因が分かりません。

ikedas

2018/11/07 11:31

Mahala2の中で、invの値はどうなっていますか。

行動規範の内容に同意します

回答1件

自己解決

色々と有難うございます。def Mahala2(vec_x, vec_mean, mat):
以降で、以下の様にしたところ、解決致しました。

Python
1    vec_x = np.array(vec_x, dtype='float64')
2

投稿2018/11/29 06:21

_hh

総合スコア79

あなたの回答

tips

プレビュー

行動規範の内容に同意します

質問の解決につながる回答をしましょう。サンプルコードなど、より具体的な説明があると質問者の理解の助けになります。また、読む側のことを考えた、分かりやすい文章を心がけましょう。

15分調べてもわからないことは
teratailで質問しよう！

ただいまの回答率
85.30%

質問をまとめることで
思考を整理して素早く解決

テンプレート機能で
簡単に質問をまとめる

質問する