質問編集履歴

2

コードの改善

2019/07/18 11:25

投稿

jyon
jyon

スコア13

test CHANGED
File without changes
test CHANGED
@@ -32,11 +32,99 @@
32
32
 
33
33
  ```python
34
34
 
35
+ import MeCab
36
+
37
+ from gensim.corpora.dictionary import Dictionary
38
+
39
+ from gensim.models import LdaModel
40
+
41
+ from gensim.models import HdpModel
42
+
43
+ from collections import defaultdict
44
+
45
+
46
+
47
+ # MeCabオブジェクトの生成
48
+
49
+ mt = MeCab.Tagger('')
50
+
51
+
52
+
53
+ mt.parse('')
54
+
55
+
56
+
57
+ # トピック数の設定
58
+
59
+ NUM_TOPICS = 3
60
+
61
+ hdp_num_topics = 10
62
+
63
+
64
+
65
+ if __name__ == "__main__":
66
+
67
+ # トレーニングデータの読み込み
68
+
69
+ # train_texts は二次元のリスト
70
+
71
+ # テキストデータを一件ずつ分かち書き(名詞に限定)して train_texts に格納するだけ
72
+
73
+ train_texts = []
74
+
75
+ with open('train.txt', 'r',encoding='utf-8') as f:
76
+
77
+ for line in f:
78
+
79
+ text = []
80
+
81
+ node = mt.parseToNode(line.strip())
82
+
83
+ while node:
84
+
85
+ fields = node.feature.split(",")
86
+
87
+ if fields[0] == '名詞':
88
+
89
+ text.append(node.surface)
90
+
91
+ node = node.next
92
+
93
+ train_texts.append(text)
94
+
95
+ words = Dictionary(train_texts)
96
+
97
+ print(words)
98
+
99
+
100
+
101
+ from gensim import corpora
102
+
103
+
104
+
105
+ # train_texts はさっき作成した単語リスト(文書ごとの単語のリスト)
106
+
107
+ dictionary = corpora.Dictionary(train_texts)
108
+
109
+ print(dictionary.token2id)
110
+
111
+
112
+
113
+ # no_above: 使われてる文章の割合がno_above以上の場合無視
114
+
115
+ dictionary.filter_extremes(no_below=20, no_above=0.3)
116
+
117
+
118
+
119
+ dictionary.save_as_text('train.txt')
120
+
35
121
  words = bytes('words', 'UTF-8')
36
122
 
37
- dictionary = corpora.Dictionary.load_from_text('livedoordic.txt')
123
+ dictionary = corpora.Dictionary.load_from_text('train.txt')
38
124
 
39
125
  type(words)
126
+
127
+
40
128
 
41
129
  vec = dictionary.doc2bow(words)
42
130
 

1

誤字

2019/07/18 11:25

投稿

jyon
jyon

スコア13

test CHANGED
File without changes
test CHANGED
@@ -17,24 +17,6 @@
17
17
  <ipython-input-55-b3b1a98eecac> in <module>
18
18
 
19
19
  ----> 1 vec = dictionary.doc2bow(words)
20
-
21
- 2 print(vec)
22
-
23
-
24
-
25
- ~\Anaconda3\lib\site-packages\gensim\corpora\dictionary.py in doc2bow(self, document, allow_update, return_missing)
26
-
27
- 251 counter = defaultdict(int)
28
-
29
- 252 for w in document:
30
-
31
- --> 253 counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
32
-
33
- 254
34
-
35
- 255 token2id = self.token2id
36
-
37
-
38
20
 
39
21
  TypeError: decoding to str: need a bytes-like object, int found
40
22