I have datasets x, x2, and y for a regression problem of X against y. When running cross-validation to choose hyperparameters, I want to autoscale x, leave x2 unprocessed, combine them as X = x + x2, and finally regress X against y.
When I run the code below, however, I get an error at the X = x + x2 step.
I suspect the sample numbers drawn from x and x2 during the random extraction do not match. How can I extract rows at random from the two dataframes while keeping the sample numbers identical between them?
x:
| No. | a | b |
|---|---|---|
| 1 | 3.1 | 1.2 |
| 2 | 10.0 | 2.2 |
| 3 | 1.1 | 1.2 |
| 4 | 3.5 | 3.2 |
x2:
| No. | c | d |
|---|---|---|
| 1 | 0 | 1 |
| 2 | 0 | 0 |
| 3 | 0 | 1 |
| 4 | 1 | 0 |
When extracting rows at random, I want only the data No. to stay identical between x and x2.
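For illustration, this is the kind of behavior I am after: one shared random permutation applied to both frames so the sample numbers stay aligned (a minimal sketch with made-up values, not my actual data):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-ins for my x and x2 (rows assumed to be in the same order)
x = pd.DataFrame({'a': [3.1, 10.0, 1.1, 3.5], 'b': [1.2, 2.2, 1.2, 3.2]})
x2 = pd.DataFrame({'c': [0, 0, 0, 1], 'd': [1, 0, 1, 0]})

rng = np.random.RandomState(0)
perm = rng.permutation(len(x))  # one shared permutation of row positions

x_shuffled = x.iloc[perm]    # rows of x in shuffled order
x2_shuffled = x2.iloc[perm]  # the same rows, in the same order, from x2
print(x_shuffled.index.equals(x2_shuffled.index))  # True: data No. stays aligned
```

The full code I tried is below.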
```python
import math

import numpy as np
import numpy.matlib
from sklearn import svm
from sklearn.model_selection import GridSearchCV


def double_cross_validation(gs_cv, x, x2, y, outer_fold_number, do_autoscaling=True, random_state=0):
    """
    Double Cross-Validation (DCV)

    Estimate y-values in DCV

    Parameters
    ----------
    gs_cv : object of GridSearchCV (sklearn.model_selection.GridSearchCV)
        for more details, please go to https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    x : numpy.array or pandas.DataFrame
        m x n matrix of X-variables of training data, m is the number of training samples and n is the number of X-variables
    x2 : numpy.array or pandas.DataFrame
        m x n2 matrix of X-variables of training data that are concatenated without autoscaling
    y : numpy.array or pandas.DataFrame
        m x 1 vector of a Y-variable of training data
    outer_fold_number : int
        Fold number in outer CV (fold number in inner CV is included in gs_cv)
    do_autoscaling : bool
        flag of autoscaling, if True, do autoscaling
    random_state : int
        random seed, if None, random seed is not set

    Returns
    -------
    estimated_y : numpy.array
        estimated y-values in DCV
    """
    x = np.array(x)
    x2 = np.array(x2)
    y = np.array(y)

    # how to divide the dataset in outer CV
    min_number = math.floor(x.shape[0] / outer_fold_number)
    mod_number = x.shape[0] - min_number * outer_fold_number
    index = np.matlib.repmat(np.arange(1, outer_fold_number + 1, 1), 1, min_number).ravel()
    if mod_number != 0:
        index = np.r_[index, np.arange(1, mod_number + 1, 1)]
    if random_state is not None:
        np.random.seed(random_state)
    fold_index_in_outer_cv = np.random.permutation(index)
    np.random.seed(0)

    estimated_y = np.zeros(len(y))
    for fold_number_in_outer_cv in np.arange(1, outer_fold_number + 1, 1):
        print(fold_number_in_outer_cv, '/', outer_fold_number)
        # divide training data and test data
        x_train = x[fold_index_in_outer_cv != fold_number_in_outer_cv, :].copy()
        x2_train = x2[fold_index_in_outer_cv != fold_number_in_outer_cv, :].copy()
        y_train = y[fold_index_in_outer_cv != fold_number_in_outer_cv].copy()
        x_test = x[fold_index_in_outer_cv == fold_number_in_outer_cv, :].copy()
        x2_test = x2[fold_index_in_outer_cv == fold_number_in_outer_cv, :].copy()
        # shuffle samples
        if random_state != -999:
            np.random.seed(0)
        random_numbers = np.random.permutation(np.arange(x_train.shape[0]))
        x_train = x_train[random_numbers, :]
        x2_train = x2_train[random_numbers, :]
        y_train = y_train[random_numbers]
        np.random.seed(0)
        # autoscaling
        if do_autoscaling:
            autoscaled_x_train_pre = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
            autoscaled_x_train = np.concatenate([autoscaled_x_train_pre, x2_train], axis=1)
            autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
            autoscaled_x_test_pre = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
            autoscaled_x_train = np.concatenate([[autoscaled_x_test_pre], [x2_test]], axis=1)
        else:
            autoscaled_x_train = x_train.copy()
            autoscaled_y_train = y_train.copy()
            autoscaled_x_test = x_test.copy()
        # inner CV
        gs_cv.fit(autoscaled_x_train, autoscaled_y_train)
        # modeling
        model = getattr(gs_cv, 'estimator')
        hyperparameters = list(gs_cv.best_params_.keys())
        for hyperparameter in hyperparameters:
            setattr(model, hyperparameter, gs_cv.best_params_[hyperparameter])
        model.fit(autoscaled_x_train, autoscaled_y_train)
        # prediction
        estimated_y_test = np.ndarray.flatten(model.predict(autoscaled_x_test))
        if do_autoscaling:
            estimated_y_test = estimated_y_test * y_train.std(ddof=1) + y_train.mean()
        estimated_y[fold_index_in_outer_cv == fold_number_in_outer_cv] = estimated_y_test  # store the fold's predictions

    return estimated_y


# Settings
inner_fold_number = 10   # "fold_number"-fold cross-validation (CV) for inner CV
outer_fold_number = 54   # "fold_number"-fold CV for outer CV
parameters = {
    "C": [2**n for n in range(-5, 11)],        # Candidates of C
    "epsilon": [2**n for n in range(-10, 1)],  # Candidates of epsilon
    "gamma": [2**n for n in range(-20, 11)],   # Candidates of gamma
}

# DCV
inner_cv = GridSearchCV(svm.SVR(), parameters, scoring='max_error', n_jobs=-1, cv=inner_fold_number)
y_pred = double_cross_validation(gs_cv=inner_cv, x=x, x2=x2, y=y, outer_fold_number=outer_fold_number, do_autoscaling=True, random_state=0)

# generalization performance from DCV using SVR
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
print(r2_score(y, y_pred))
print(mean_absolute_error(y, y_pred))
```
Error:

```
ValueError                                Traceback (most recent call last)
<ipython-input-40-de7659de5d58> in <module>
     99 # DCV
    100 inner_cv = GridSearchCV(svm.SVR(), parameters, scoring = 'max_error', n_jobs = -1, cv = inner_fold_number)
--> 101 y_pred = double_cross_validation(gs_cv=inner_cv, x=x, x2 =x2, y=y, outer_fold_number=outer_fold_number, do_autoscaling=True, random_state=0)
    102
    103 # generalization performance from DCV using SVR

<ipython-input-40-de7659de5d58> in double_cross_validation(gs_cv, x, x2, y, outer_fold_number, do_autoscaling, random_state)
     66             autoscaled_y_train = (y_train - y_train.mean()) / y_train.std(ddof=1)
     67             autoscaled_x_test_pre = (x_test - x_train.mean(axis=0)) / x_train.std(axis=0, ddof=1)
---> 68             autoscaled_x_train = np.concatenate([[autoscaled_x_test_pre], [x2_test]], axis=1)
     69         else:
     70             autoscaled_x_train = x_train.copy()

ValueError: all the input array dimensions except for the concatenation axis must match exactly
```
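Incidentally, I can reproduce the same ValueError in isolation: wrapping each array in an extra list before np.concatenate adds a leading dimension, so the column counts must then match exactly. A minimal sketch, assuming x and x2 have different numbers of columns (stand-in shapes, not my actual data):

```python
import numpy as np

a = np.zeros((4, 2))  # stand-in for autoscaled_x_test_pre: 4 samples, 2 columns
b = np.zeros((4, 3))  # stand-in for x2_test: 4 samples, 3 columns (assumed to differ)

print(np.concatenate([a, b], axis=1).shape)  # (4, 5): columns joined side by side

# Wrapping each array in a list gives 3-D shapes (1, 4, 2) and (1, 4, 3); on
# axis=1 every other axis must match, and 2 != 3 raises the ValueError above.
# np.concatenate([[a], [b]], axis=1)
```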
Thank you in advance.