ある映画の感想が入っているテキストのデータセットから感情分析を行いたいのですが以下のエラーが出て行き詰りました.どなたか教えてください.
試したことはGridSearchCVのn_jobsを-1にすることを推奨されていたのでためしてみたのですが,うまくいきませんでした.エラー文とコードを載せます.
python
1import pyprind 2import pandas as pd 3import os 4import numpy as np 5import re #正規表現 re (regular expression) 6from nltk.stem.porter import PorterStemmer 7import nltk 8 9from sklearn.model_selection import GridSearchCV 10from sklearn.pipeline import Pipeline 11from sklearn.linear_model import LogisticRegression 12from sklearn.feature_extraction.text import TfidfVectorizer 13 14 15basepath = 'aclImdb' 16labels = {'pos':1,'neg':0} 17pbar = pyprind.ProgBar(5000) 18 19df=pd.DataFrame() 20 21for s in ('test','train'): 22 for l in ('pos','neg'): 23 path = os.path.join(basepath,s,l) 24 for file in os.listdir(path): 25 with open(os.path.join(path,file),'r',encoding = 'utf-8') as infile: 26 txt = infile.read() 27 df = df.append([[txt,labels[l]]],ignore_index=True) 28 pbar.update() 29print('done') 30df.columns = ['review','sentiment'] 31 32np.random.seed(0) 33df = df.reindex(np.random.permutation(df.index)) 34df.to_csv('movie_data.csv',index=False,encoding='utf-8') 35 36df = pd.read_csv('movie_data.csv',encoding = 'utf-8') 37 38def preprocessor(text): 39 text = re.sub('<[^>]*>','',text) # <[^>]*> でHTMLタグを削除 40 emoticons = re.findall('(?::|;|=)(?:-)?(?:)|(|D|P)',text) 41 text = (re.sub('[\W]+',' ',text.lower()) + ''.join(emoticons).replace('-','')) 42 return text 43 44def tokenizer(text): 45 return text.split() 46 47porter = PorterStemmer() 48def tokenizer_porter(text): 49 return [porter.stem(word) for word in text.split()] 50from nltk.corpus import stopwords 51stop = stopwords.words('english') 52[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] 53if w not in stop] 54 55X_train = df.loc[:25000,'review'].values 56y_train = df.loc[:25000,'sentiment'].values 57X_test = df.loc[25000:,'review'].values 58y_test = df.loc[25000:,'sentiment'].values 59 60tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None,) 61param_grid = [{'vect_ngram_range': [(1,1)], 62 'vect_stop_words': [stop,None], 63 'vect_tokenizer' : [tokenizer,tokenizer_porter], 64 'clf_penalty' : ['11','12'], 65 'clf_C' : [1.0,10.0,100.0]}, 66 { 67 'vect_ngram_range' : [(1,1)], 68 'vect_stop_words' : [stop,None], 69 'vect_tokenizer' : [str.split], 70 'vect_use_idf' : [False], 71 'vect_norm' : [None], 72 'clf_penalty' : ['11','12'], 73 'clf_C' : [1.0,10.0,100.0] 74 } 75 ] 76lr_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))]) 77gs_lr_tfidf = GridSearchCV(lr_tfidf,param_grid,scoring='accuracy', 78 cv=5,verbose=1,n_jobs=-1) 79gs_lr_tfidf.fit(X_train,y_train) 80
erro文
1ValueError: Invalid parameter clf_C for estimator Pipeline(memory=None, 2 steps=[('vect', 3 TfidfVectorizer(analyzer='word', binary=False, 4 decode_error='strict', 5 dtype=<class 'numpy.float64'>, 6 encoding='utf-8', input='content', 7 lowercase=False, max_df=1.0, max_features=None, 8 min_df=1, ngram_range=(1, 1), norm='l2', 9 preprocessor=None, smooth_idf=True, 10 stop_words=None, strip_accents=None, 11 sublinear_tf=False, 12 token_pattern='(?u)\b\w\w+\b', 13 tokenizer=None, use_idf=True, 14 vocabulary=None)), 15 ('clf', 16 LogisticRegression(C=1.0, class_weight=None, dual=False, 17 fit_intercept=True, intercept_scaling=1, 18 l1_ratio=None, max_iter=100, 19 multi_class='warn', n_jobs=None, 20 penalty='l2', random_state=0, solver='warn', 21 tol=0.0001, verbose=0, warm_start=False))], 22 verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.
回答1件
あなたの回答
tips
プレビュー
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。
2019/08/21 00:18