質問編集履歴
2
コードの改善
title
CHANGED
File without changes
|
body
CHANGED
@@ -15,9 +15,53 @@
|
|
15
15
|
### 該当のソースコード
|
16
16
|
|
17
17
|
```python
|
18
|
+
import MeCab
|
19
|
+
from gensim.corpora.dictionary import Dictionary
|
20
|
+
from gensim.models import LdaModel
|
21
|
+
from gensim.models import HdpModel
|
22
|
+
from collections import defaultdict
|
23
|
+
|
24
|
+
# MeCabオブジェクトの生成
|
25
|
+
mt = MeCab.Tagger('')
|
26
|
+
|
27
|
+
mt.parse('')
|
28
|
+
|
29
|
+
# トピック数の設定
|
30
|
+
NUM_TOPICS = 3
|
31
|
+
hdp_num_topics = 10
|
32
|
+
|
33
|
+
if __name__ == "__main__":
|
34
|
+
# トレーニングデータの読み込み
|
35
|
+
# train_texts は二次元のリスト
|
36
|
+
# テキストデータを一件ずつ分かち書き(名詞に限定)して train_texts に格納するだけ
|
37
|
+
train_texts = []
|
38
|
+
with open('train.txt', 'r',encoding='utf-8') as f:
|
39
|
+
for line in f:
|
40
|
+
text = []
|
41
|
+
node = mt.parseToNode(line.strip())
|
42
|
+
while node:
|
43
|
+
fields = node.feature.split(",")
|
44
|
+
if fields[0] == '名詞':
|
45
|
+
text.append(node.surface)
|
46
|
+
node = node.next
|
47
|
+
train_texts.append(text)
|
48
|
+
words = Dictionary(train_texts)
|
49
|
+
print(words)
|
50
|
+
|
51
|
+
from gensim import corpora
|
52
|
+
|
53
|
+
# train_texts はさっき作成した単語リスト(文書ごとの名詞のリスト)
|
54
|
+
dictionary = corpora.Dictionary(train_texts)
|
55
|
+
print(dictionary.token2id)
|
56
|
+
|
57
|
+
# no_above: 出現する文書の割合が no_above を超える単語は無視
|
58
|
+
dictionary.filter_extremes(no_below=20, no_above=0.3)
|
59
|
+
|
60
|
+
dictionary.save_as_text('train.txt')
|
18
61
|
words = bytes('words', 'UTF-8')
|
19
|
-
dictionary = corpora.Dictionary.load_from_text('
|
62
|
+
dictionary = corpora.Dictionary.load_from_text('train.txt')
|
20
63
|
type(words)
|
64
|
+
|
21
65
|
vec = dictionary.doc2bow(words)
|
22
66
|
print(vec)
|
23
67
|
```
|
1
誤字
title
CHANGED
File without changes
|
body
CHANGED
@@ -8,15 +8,6 @@
|
|
8
8
|
TypeError Traceback (most recent call last)
|
9
9
|
<ipython-input-55-b3b1a98eecac> in <module>
|
10
10
|
----> 1 vec = dictionary.doc2bow(words)
|
11
|
-
2 print(vec)
|
12
|
-
|
13
|
-
~\Anaconda3\lib\site-packages\gensim\corpora\dictionary.py in doc2bow(self, document, allow_update, return_missing)
|
14
|
-
251 counter = defaultdict(int)
|
15
|
-
252 for w in document:
|
16
|
-
--> 253 counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
|
17
|
-
254
|
18
|
-
255 token2id = self.token2id
|
19
|
-
|
20
11
|
TypeError: decoding to str: need a bytes-like object, int found
|
21
12
|
```
|
22
13
|
|