質問編集履歴
2
コードの改善
test
CHANGED
File without changes
|
test
CHANGED
@@ -32,11 +32,99 @@
|
|
32
32
|
|
33
33
|
```python
|
34
34
|
|
35
|
+
import MeCab
|
36
|
+
|
37
|
+
from gensim.corpora.dictionary import Dictionary
|
38
|
+
|
39
|
+
from gensim.models import LdaModel
|
40
|
+
|
41
|
+
from gensim.models import HdpModel
|
42
|
+
|
43
|
+
from collections import defaultdict
|
44
|
+
|
45
|
+
|
46
|
+
|
47
|
+
# MeCabオブジェクトの生成
|
48
|
+
|
49
|
+
mt = MeCab.Tagger('')
|
50
|
+
|
51
|
+
|
52
|
+
|
53
|
+
mt.parse('')
|
54
|
+
|
55
|
+
|
56
|
+
|
57
|
+
# トピック数の設定
|
58
|
+
|
59
|
+
NUM_TOPICS = 3
|
60
|
+
|
61
|
+
hdp_num_topics = 10
|
62
|
+
|
63
|
+
|
64
|
+
|
65
|
+
if __name__ == "__main__":
|
66
|
+
|
67
|
+
# トレーニングデータの読み込み
|
68
|
+
|
69
|
+
# train_texts は二次元のリスト
|
70
|
+
|
71
|
+
# テキストデータを一件ずつ分かち書き(名詞に限定)して train_texts に格納するだけ
|
72
|
+
|
73
|
+
train_texts = []
|
74
|
+
|
75
|
+
with open('train.txt', 'r',encoding='utf-8') as f:
|
76
|
+
|
77
|
+
for line in f:
|
78
|
+
|
79
|
+
text = []
|
80
|
+
|
81
|
+
node = mt.parseToNode(line.strip())
|
82
|
+
|
83
|
+
while node:
|
84
|
+
|
85
|
+
fields = node.feature.split(",")
|
86
|
+
|
87
|
+
if fields[0] == '名詞':
|
88
|
+
|
89
|
+
text.append(node.surface)
|
90
|
+
|
91
|
+
node = node.next
|
92
|
+
|
93
|
+
train_texts.append(text)
|
94
|
+
|
95
|
+
words = Dictionary(train_texts)
|
96
|
+
|
97
|
+
print(words)
|
98
|
+
|
99
|
+
|
100
|
+
|
101
|
+
from gensim import corpora
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
# train_texts はさっき作成した分かち書き済みの単語リスト
|
106
|
+
|
107
|
+
dictionary = corpora.Dictionary(train_texts)
|
108
|
+
|
109
|
+
print(dictionary.token2id)
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
# no_above: 使われてる文章の割合がno_above以上の場合無視
|
114
|
+
|
115
|
+
dictionary.filter_extremes(no_below=20, no_above=0.3)
|
116
|
+
|
117
|
+
|
118
|
+
|
119
|
+
dictionary.save_as_text('train.txt')
|
120
|
+
|
35
121
|
words = bytes('words', 'UTF-8')
|
36
122
|
|
37
|
-
dictionary = corpora.Dictionary.load_from_text('
|
123
|
+
dictionary = corpora.Dictionary.load_from_text('train.txt')
|
38
124
|
|
39
125
|
type(words)
|
126
|
+
|
127
|
+
|
40
128
|
|
41
129
|
vec = dictionary.doc2bow(words)
|
42
130
|
|
1
誤字
test
CHANGED
File without changes
|
test
CHANGED
@@ -17,24 +17,6 @@
|
|
17
17
|
<ipython-input-55-b3b1a98eecac> in <module>
|
18
18
|
|
19
19
|
----> 1 vec = dictionary.doc2bow(words)
|
20
|
-
|
21
|
-
2 print(vec)
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
~\Anaconda3\lib\site-packages\gensim\corpora\dictionary.py in doc2bow(self, document, allow_update, return_missing)
|
26
|
-
|
27
|
-
251 counter = defaultdict(int)
|
28
|
-
|
29
|
-
252 for w in document:
|
30
|
-
|
31
|
-
--> 253 counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
|
32
|
-
|
33
|
-
254
|
34
|
-
|
35
|
-
255 token2id = self.token2id
|
36
|
-
|
37
|
-
|
38
20
|
|
39
21
|
TypeError: decoding to str: need a bytes-like object, int found
|
40
22
|
|