質問編集履歴

2

コードの改善

2019/07/18 11:25

投稿

jyon

スコア13

title CHANGED
File without changes
body CHANGED
@@ -15,9 +15,53 @@
15
15
  ### 該当のソースコード
16
16
 
17
17
  ```python
18
+ import MeCab
19
+ from gensim.corpora.dictionary import Dictionary
20
+ from gensim.models import LdaModel
21
+ from gensim.models import HdpModel
22
+ from collections import defaultdict
23
+
24
+ # MeCabオブジェクトの生成
25
+ mt = MeCab.Tagger('')
26
+
27
+ mt.parse('')
28
+
29
+ # トピック数の設定
30
+ NUM_TOPICS = 3
31
+ hdp_num_topics = 10
32
+
33
+ if __name__ == "__main__":
34
+ # トレーニングデータの読み込み
35
+ # train_texts は二次元のリスト
36
+ # テキストデータを一件ずつ分かち書き(名詞、動詞、形容詞に限定)して train_texts に格納するだけ
37
+ train_texts = []
38
+ with open('train.txt', 'r',encoding='utf-8') as f:
39
+ for line in f:
40
+ text = []
41
+ node = mt.parseToNode(line.strip())
42
+ while node:
43
+ fields = node.feature.split(",")
44
+ if fields[0] == '名詞':
45
+ text.append(node.surface)
46
+ node = node.next
47
+ train_texts.append(text)
48
+ words = Dictionary(train_texts)
49
+ print(words)
50
+
51
+ from gensim import corpora
52
+
53
+ # words はさっきの単語リスト
54
+ dictionary = corpora.Dictionary(train_texts)
55
+ print(dictionary.token2id)
56
+
57
+ # no_above: 使われてる文章の割合がno_above以上の場合無視
58
+ dictionary.filter_extremes(no_below=20, no_above=0.3)
59
+
60
+ dictionary.save_as_text('train.txt')
18
61
  words = bytes('words', 'UTF-8')
19
- dictionary = corpora.Dictionary.load_from_text('livedoordic.txt')
62
+ dictionary = corpora.Dictionary.load_from_text('train.txt')
20
63
  type(words)
64
+
21
65
  vec = dictionary.doc2bow(words)
22
66
  print(vec)
23
67
  ```

1

誤字

2019/07/18 11:25

投稿

jyon

スコア13

title CHANGED
File without changes
body CHANGED
@@ -8,15 +8,6 @@
8
8
  TypeError Traceback (most recent call last)
9
9
  <ipython-input-55-b3b1a98eecac> in <module>
10
10
  ----> 1 vec = dictionary.doc2bow(words)
11
- 2 print(vec)
12
-
13
- ~\Anaconda3\lib\site-packages\gensim\corpora\dictionary.py in doc2bow(self, document, allow_update, return_missing)
14
- 251 counter = defaultdict(int)
15
- 252 for w in document:
16
- --> 253 counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
17
- 254
18
- 255 token2id = self.token2id
19
-
20
11
  TypeError: decoding to str: need a bytes-like object, int found
21
12
  ```
22
13