Jupyter notebook上でPythonを実行しています。
交差検証を行い、True Positive rateを求め、その値をプロットしたいです。
#レポート課題 #データの読み込み import numpy as np import pandas as pd #読み込みと削除 pima_tr = pd.read_csv('data2/pima_tr.csv' , encoding='UTF-8' , index_col=0) pima_te = pd.read_csv('data2/pima_te.csv' , encoding='UTF-8' , index_col=0) #結合 group_data = pd.concat([pima_tr, pima_te], ignore_index = True) #モジュール読み込み from sklearn import preprocessing from sklearn.model_selection import train_test_split from sklearn.model_selection import cross_val_score from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import confusion_matrix # データのスケーリングとtrainデータとtestデータに分ける X = preprocessing.scale(group_data[["npreg","glu","bp","skin","bmi","ped","age"]]) y = group_data.type X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7) knn = KNeighborsClassifier() knn.fit(X_train, y_train) from sklearn import linear_model clf = linear_model.LogisticRegression() neighbors = list(range(2, 50)) mean_score= list() for k in neighbors: scores = cross_val_score(clf, X, y, cv=k) mean_score.append(np.mean(scores)) y_pred = knn.predict(X_train) cmat = confusion_matrix(y_train, y_pred) #True Positive rateの計算 tpr=cmat[1,0]/(cmat[1,0]+cmat[1,1]) tpr_scores.append(tpr) #最適なk(True Positive rateの計算が最大)の表示 import matplotlib.pyplot as plt %matplotlib inline optimal_k = neighbors[tpr_scores.index(max(filter(lambda v: v <1 , tpr_scores)))] print("The best number of k is %d." % optimal_k) # 結果の可視化 fig, ax = plt.subplots(figsize=(10, 6)) ax.plot(neighbors, tpr_scores) ax.set_xlabel('Number of Neighbors K') ax.set_ylabel('True Positive rate')
しかし、このエラーが出てしまい、プロットできません。
x and y must have same first dimension, but have shapes (48,) and (96,)
どこがおかしいのでしょうか、よろしくお願いいたします。
回答1件
あなたの回答
tips
プレビュー