teratail header banner
teratail header banner
質問するログイン新規登録

回答編集履歴

1

edit

2017/12/05 04:40

投稿

mkgrei
mkgrei

スコア8562

answer CHANGED
@@ -3,4 +3,87 @@
3
3
  gensim.models.ldamodel.LdaModel
4
4
  の違いについての質問でしょうか?
5
5
 
6
- であれば、gensim.model.ldamodal.Ldamodelはgensim/model/ldamodal.pyにあるLdamodelクラスで、gensim.model.Ldamodelはアクセスしやすくしているだけかと。
6
+ であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyで定義されているLdaModelクラスで、gensim.models.LdaModelは同じクラスをアクセスしやすいように公開しているだけかと思います。
7
+
8
+ ---
9
+ ```python
10
+ #!/usr/bin/env python3
11
+ # -*- coding: utf-8 -*-
12
+
13
+ import time
14
+ import glob
15
+ import MeCab
16
+ from gensim import corpora, models
17
+ import numpy as np
18
+
19
def get_files(path):
    """Return the file paths matching the glob pattern *path*.

    Args:
        path: A glob pattern such as ``'train_set/*.txt'``; a plain file
            path is also accepted and matches itself when the file exists.

    Returns:
        A list of matching paths, sorted by name. ``glob.glob`` makes no
        ordering guarantee, so sorting keeps the document order (and with
        it the corpus order) reproducible between runs.
    """
    return sorted(glob.glob(path))
21
+
22
def _select_nouns(parsed):
    """Return the third column of every kept noun line in *parsed*.

    *parsed* is the tab-separated text produced by the MeCab tagger.
    NOTE(review): this assumes the ``-Ochasen`` column layout (surface,
    reading, base form, part of speech, ...) -- confirm against the
    installed MeCab dictionary.
    """
    excluded_subtypes = ('代名詞', '非自立', '固有名詞', '数')
    nouns = []
    for line in parsed.splitlines():
        cols = line.split('\t')
        if len(cols) < 4:
            # Lines without a fourth column carry no part-of-speech tag.
            continue
        pos = cols[3].split('-')
        if not pos[0].startswith('名詞'):
            continue
        # A tag may have no subcategory at all; only index it when present
        # (the unguarded pos[1] lookup raised IndexError on such lines).
        if len(pos) > 1 and pos[1] in excluded_subtypes:
            continue
        nouns.append(cols[2])
    return nouns


def get_texts(files):
    """Tokenize each file with MeCab and keep only the selected nouns.

    Args:
        files: Iterable of paths to UTF-8 encoded text files.

    Returns:
        A list with one entry per file, each entry being the list of noun
        tokens extracted from that file (possibly empty).
    """
    tagger = MeCab.Tagger("-Ochasen")
    texts = []
    for path in files:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        texts.append(_select_nouns(tagger.parse(text)))
    return texts
40
+
41
def get_dictionary(texts):
    """Build a gensim ``corpora.Dictionary`` from tokenized documents.

    Args:
        texts: List of documents, each a list of string tokens.

    Returns:
        The ``corpora.Dictionary`` constructed from *texts*.
    """
    return corpora.Dictionary(texts)
44
+
45
def get_corpus(texts, dictionary):
    """Convert tokenized documents to bag-of-words form.

    Args:
        texts: List of documents, each a list of string tokens.
        dictionary: Object providing ``doc2bow(tokens)``; in this script it
            is the dictionary returned by ``get_dictionary``.

    Returns:
        A list with one ``dictionary.doc2bow(...)`` result per document,
        in the same order as *texts*.
    """
    return [dictionary.doc2bow(tokens) for tokens in texts]
48
+
49
def get_model(corpus, dictionary, num_topics=10):
    """Train an LDA topic model on *corpus*.

    Args:
        corpus: Bag-of-words documents, as produced by ``get_corpus``.
        dictionary: Id-to-word mapping passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``models.LdaModel``. (The model used to be bound to a
        local named ``lda`` while an undefined name ``model`` was returned,
        which raised NameError.)
    """
    return models.LdaModel(corpus=corpus, id2word=dictionary,
                           num_topics=num_topics)
52
+
53
def get_feature_vector(path, dictionary, model):
    """Return the topic-probability feature vector for the files at *path*.

    Args:
        path: Glob pattern or single file path, resolved by ``get_files``.
        dictionary: Dictionary used to convert tokens to bag-of-words.
        model: Trained topic model; indexing it with a corpus must yield,
            per document, ``(topic_id, probability)`` pairs, and it must
            expose ``num_topics``.

    Returns:
        A flat list of probabilities: ``num_topics`` values per matched
        document, concatenated in document order.
    """
    files = get_files(path)
    texts = get_texts(files)
    corpus = get_corpus(texts, dictionary)

    num_topics = model.num_topics
    vector = []
    for doc_topics in model[corpus]:
        # The model reports topics sparsely (gensim omits topics below its
        # minimum-probability threshold), so the bare probabilities would
        # give vectors of varying length whose positions do not line up
        # between documents. Place each value at its topic id instead.
        dense = [0.0] * num_topics
        for topic_id, prob in doc_topics:
            dense[topic_id] = prob
        vector.extend(dense)
    return vector
58
+
59
def metric_inverse_norm(a, b):
    """Return the inverse Euclidean distance between vectors *a* and *b*.

    Larger values mean the vectors are closer. The distance is floored at
    ``1E-10`` so identical vectors give a large finite score instead of a
    division by zero.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as *a*.

    Returns:
        ``1 / max(||a - b||, 1E-10)`` as a float.
    """
    distance = np.linalg.norm(np.array(a) - np.array(b))
    # Built-in max compares the two scalars; np.max would treat the second
    # positional argument as an axis and raise TypeError.
    return 1. / max(distance, 1E-10)
61
+
62
def metric_projection(a, b):
    """Return the scalar projection of vector *a* onto vector *b*.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as *a*.

    Returns:
        ``dot(a, b) / ||b||``. The norm is floored at ``1E-10`` so an
        all-zero *b* yields ``0.0`` rather than a division by zero.
    """
    nb = np.array(b)
    return np.dot(np.array(a), nb) / max(np.linalg.norm(nb), 1E-10)
65
+
66
def normalize_score(d):
    """Scale the values of *d* so that they sum to 1.

    Args:
        d: Mapping from label to numeric score.

    Returns:
        A new dict with the same keys and each value divided by the total.
        When the total is zero (including an empty mapping) every value is
        ``0.0``, since no meaningful proportion exists.
    """
    total = sum(d.values())
    if not total:
        return {k: 0.0 for k in d}
    return {k: v / total for k, v in d.items()}
69
+
70
if __name__ == '__main__':
    # Fit the topic model on every document of the training set.
    files = get_files('train_set/*.txt')
    texts = get_texts(files)
    dictionary = get_dictionary(texts)
    corpus = get_corpus(texts, dictionary)
    model = get_model(corpus, dictionary, num_topics=10)

    # Known genre of each training file. The labels cannot be derived from
    # the data, so fill this mapping in by hand, e.g.
    # {'train_set/doc1.txt': 'A', 'train_set/doc2.txt': 'B'}.
    file_genres = {}

    genres = {'A': [], 'B': []}  # feature vectors grouped by known genre
    for f in files:
        if f not in file_genres:
            raise ValueError('no genre label for training file: %s' % f)
        v = get_feature_vector(f, dictionary, model)
        genres.setdefault(file_genres[f], []).append(v)

    # Score the test document against every genre's training vectors.
    ts_v = get_feature_vector('test_set/test.txt', dictionary, model)
    scores = {}
    metric = metric_inverse_norm  # pick one similarity metric
    for k, v in genres.items():
        scores[k] = sum([metric(ts_v, vv) for vv in v])
    scores = normalize_score(scores)
    print(scores)
89
+ ```