質問編集履歴
1
tf_idfs = vectorizer.fit_transform(training_docs) に修正しました。(変更前は words(df.ix[i,"titlebeginning"]) を渡していました。)
title
CHANGED
File without changes
|
body
CHANGED
@@ -54,7 +54,8 @@
|
|
54
54
|
|
55
55
|
for i in range(0,len(df)):
|
56
56
|
vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\b\w+\b')
|
57
|
-
tf_idfs = vectorizer.fit_transform(
|
57
|
+
tf_idfs = vectorizer.fit_transform(training_docs)
|
58
|
+
print(tf_idfs)
|
58
59
|
```
|
59
60
|
|
60
61
|
以上の処理を行い、最終的には、以下のコードにtf-idf処理を施して重要度の低い単語を除いたtraining_docsを代入したいのですが、どのようにしたら良いのでしょうか。
|
@@ -66,4 +67,51 @@
|
|
66
67
|
model.docvecs.similarity(0,1551)
|
67
68
|
```
|
68
69
|
|
70
|
+
エラー内容
|
71
|
+
```python
|
72
|
+
AttributeError Traceback (most recent call last)
|
73
|
+
<ipython-input-32-a49b5702c1c0> in <module>()
|
74
|
+
4 for i in range(0,len(df)):
|
75
|
+
5 vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\b\w+\b')
|
76
|
+
----> 6 tf_idfs = vectorizer.fit_transform(training_docs)
|
77
|
+
7 print(tf_idfs)
|
78
|
+
|
79
|
+
~/anaconda3/envs/kenkyuu/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
|
80
|
+
1379 Tf-idf-weighted document-term matrix.
|
81
|
+
1380 """
|
82
|
+
-> 1381 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
|
83
|
+
1382 self._tfidf.fit(X)
|
84
|
+
1383 # X is already a transformed view of raw_documents so
|
85
|
+
|
86
|
+
~/anaconda3/envs/kenkyuu/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in fit_transform(self, raw_documents, y)
|
87
|
+
867
|
88
|
+
868 vocabulary, X = self._count_vocab(raw_documents,
|
89
|
+
--> 869 self.fixed_vocabulary_)
|
90
|
+
870
|
91
|
+
871 if self.binary:
|
92
|
+
|
93
|
+
~/anaconda3/envs/kenkyuu/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in _count_vocab(self, raw_documents, fixed_vocab)
|
94
|
+
790 for doc in raw_documents:
|
95
|
+
791 feature_counter = {}
|
96
|
+
--> 792 for feature in analyze(doc):
|
97
|
+
793 try:
|
98
|
+
794 feature_idx = vocabulary[feature]
|
99
|
+
|
100
|
+
~/anaconda3/envs/kenkyuu/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(doc)
|
101
|
+
264
|
102
|
+
265 return lambda doc: self._word_ngrams(
|
103
|
+
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
|
104
|
+
267
|
105
|
+
268 else:
|
106
|
+
|
107
|
+
~/anaconda3/envs/kenkyuu/lib/python3.6/site-packages/sklearn/feature_extraction/text.py in <lambda>(x)
|
108
|
+
230
|
109
|
+
231 if self.lowercase:
|
110
|
+
--> 232 return lambda x: strip_accents(x.lower())
|
111
|
+
233 else:
|
112
|
+
234 return strip_accents
|
113
|
+
|
114
|
+
AttributeError: 'TaggedDocument' object has no attribute 'lower'
|
115
|
+
```
|
116
|
+
|
69
117
|
[こちらが問題のファイルになります](https://www.dropbox.com/s/auixihg8n344voz/%E3%82%BF%E3%82%A4%E3%83%88%E3%83%AB%E3%81%A8%E5%86%92%E9%A0%AD%E4%B8%80%E8%A6%A7.csv?dl=0)
|