[W 2020-06-21 10:31:02,099] Setting status of trial#0 as TrialState.FAIL because of the following error: ValueError('Unknown label type: (sample_19 0.043\nsample_2 0.122\nsample_20 0.490\nsample_9 0.219\nsample_11 0.066\nsample_18 0.079\nName: property_a, dtype: float64,)')
Traceback (most recent call last):
File "C:\Users\owner\Anaconda3\lib\site-packages\optuna\study.py", line 734, in run_trial
result = func(trial)
File "C:\Users\owner\desktop\python_data_analysis_ohmsha-master\LightGBM_optuna.py", line 60, in objectives
score = log_loss(y_test, y_pred_valid)
File "C:\Users\owner\Anaconda3\lib\site-packages\sklearn\metrics\classification.py", line 1771, in log_loss
File "C:\Users\owner\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py", line 413, in fit
self.classes = unique_labels(y)
File "C:\Users\owner\Anaconda3\lib\site-packages\sklearn\utils\multiclass.py", line 96, in unique_labels
raise ValueError("Unknown label type: %s" % repr(ys))
ValueError: Unknown label type: (sample_19 0.043
sample_2 0.122
sample_20 0.490
sample_9 0.219
sample_11 0.066
sample_18 0.079
Name: property_a, dtype: float64,)
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import sample_functions
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import optuna
from sklearn.metrics import log_loss
def objectives(trial):
    """Optuna objective: fit a LightGBM regressor and return its test RMSE.

    Reads 'virtual_resin.csv', uses column ``y_number`` as the target and the
    columns from index 2 onwards as explanatory variables, holds out
    ``number_of_test_samples`` rows, autoscales both sides with statistics of
    the training split, trains LightGBM with the hyperparameters suggested by
    ``trial`` and scores the held-out rows.

    The target is continuous, so the score is the root-mean-squared error on
    the original (un-scaled) target scale.  The previous implementation used
    ``sklearn.metrics.log_loss``, a classification metric, which raised
    ``ValueError: Unknown label type`` for float targets.

    Args:
        trial: The Optuna trial used to sample ``max_depth`` and
            ``num_leaves``.

    Returns:
        float: RMSE between the held-out targets and the predictions
        (lower is better, matching Optuna's default ``minimize`` direction).
    """
    y_number = 0  # column index of the target variable: 0 or 1
    number_of_test_samples = 6  # number of samples held out as test data
    add_nonlinear_terms_flag = True  # True: add squared and cross terms

    dataset = pd.read_csv('virtual_resin.csv', index_col=0)
    y = dataset.iloc[:, y_number]  # target variable
    x = dataset.iloc[:, 2:]  # explanatory variables
    x_train_tmp, x_test_tmp, y_train, y_test = train_test_split(
        x, y, test_size=number_of_test_samples, random_state=0)

    if add_nonlinear_terms_flag:
        # Add squared and cross terms of the explanatory variables.
        x_train_tmp = sample_functions.add_nonlinear_terms(x_train_tmp)
        x_test_tmp = sample_functions.add_nonlinear_terms(x_test_tmp)

    # Drop explanatory variables whose standard deviation is 0 in the
    # training split; the same columns are dropped from the test split so
    # both frames keep identical columns.
    constant_columns = x_train_tmp.columns[x_train_tmp.std() == 0]
    x_train = x_train_tmp.drop(constant_columns, axis=1)
    x_test = x_test_tmp.drop(constant_columns, axis=1)

    # Autoscaling: the test split is scaled with training-split statistics.
    x_mean, x_std = x_train.mean(), x_train.std()
    y_mean, y_std = y_train.mean(), y_train.std()
    autoscaled_x_train = (x_train - x_mean) / x_std
    autoscaled_y_train = (y_train - y_mean) / y_std
    autoscaled_x_test = (x_test - x_mean) / x_std
    autoscaled_y_test = (y_test - y_mean) / y_std

    params = {
        'objective': 'regression',
        'learning_rate': 0.05,
        # Maximum tree depth.  The Optuna parameter name now matches the
        # LightGBM parameter it feeds (it was previously recorded as
        # 'max_bin', which made study.best_params misleading).
        'max_depth': trial.suggest_int('max_depth', 1, 50),
        'num_leaves': trial.suggest_int('num_leaves', 2, 20),
    }

    lgb_train = lgb.Dataset(autoscaled_x_train, autoscaled_y_train)
    lgb_eval = lgb.Dataset(autoscaled_x_test, autoscaled_y_test,
                           reference=lgb_train)

    # NOTE(review): verbose_eval / early_stopping_rounds are keyword
    # arguments of older LightGBM releases; newer releases expect the
    # equivalent callbacks instead -- confirm against the installed version.
    model = lgb.train(params, lgb_train,
                      valid_sets=[lgb_train, lgb_eval],
                      verbose_eval=10,
                      num_boost_round=1000,
                      early_stopping_rounds=10)

    # Predict on the autoscaled inputs and map back to the original scale.
    y_pred_valid = model.predict(
        autoscaled_x_test, num_iteration=model.best_iteration
    ) * y_std + y_mean

    # Regression score: RMSE on the original target scale.
    residuals = y_test.values - y_pred_valid
    score = float(np.sqrt(np.mean(residuals ** 2)))
    return score
# Random search with a fixed seed so the sampled hyperparameters are
# reproducible; the study uses Optuna's default optimization direction.
random_sampler = optuna.samplers.RandomSampler(seed=0)
study = optuna.create_study(sampler=random_sampler)
study.optimize(objectives, n_trials=40)
sample_functions.add_nonlinear_terms は、説明変数の二乗項や交差項を追加する関数です。
property_a property_b raw_material_1 raw_material_2 raw_material_3 temperature time
sample_1 0.125 0.707 0.5 0.1 0.4 85 80
sample_2 0.122 0.464 0.7 0 0.3 55 50
sample_3 0.624 0.683 0 0.2 0.8 70 40
sample_4 0.042 0.262 0.9 0.1 0 60 90
sample_5 0.589 0.554 0.2 0 0.8 75 120
sample_6 0.051 0.54 0.7 0.1 0.2 90 60
sample_7 0.771 0.459 0.1 0 0.9 80 10
sample_8 0.775 0.514 0.1 0 0.9 90 90
sample_9 0.219 0.741 0.4 0.1 0.5 100 110
sample_10 0.12 0.762 0.5 0.2 0.3 60 40
sample_11 0.066 0.388 0.8 0.1 0.1 50 10
sample_12 0.037 0.413 0.8 0.1 0.1 65 40
sample_13 0.1 0.788 0.5 0.2 0.3 60 80
sample_14 0.161 0.783 0.5 0.4 0.1 90 40
sample_15 0.773 0.519 0 0.1 0.9 50 10
sample_16 0.087 0.788 0.5 0.3 0.2 55 50
sample_17 0.511 0.786 0 0.3 0.7 80 20
sample_18 0.079 0.031 1 0 0 70 60
sample_19 0.043 0.426 0.8 0.1 0.1 100 100
sample_20 0.49 0.59 0.3 0 0.7 60 10