回答編集履歴

1

edit

2017/12/05 04:40

投稿

mkgrei
mkgrei

スコア8560

test CHANGED
@@ -9,3 +9,169 @@
9
9
 
10
10
 
11
11
  であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyにあるLdaModelクラスで、gensim.models.LdaModelはアクセスしやすくしているだけかと。
12
+
13
+
14
+
15
+ ---
16
+
17
+ ```python
18
+
19
+ #!/usr/bin/env python3
20
+
21
+ # -*- coding: utf-8 -*-
22
+
23
+
24
+
25
+ import time
26
+
27
+ import glob
28
+
29
+ import MeCab
30
+
31
+ from gensim import corpora, models
32
+
33
+ import numpy as np
34
+
35
+
36
+
37
def get_files(path):
    """Return the paths matching the glob pattern *path*, sorted by name.

    ``glob.glob`` yields its matches in an arbitrary order, so the result
    is sorted to make the order of documents reproducible between runs.

    Args:
        path: A glob pattern (a plain file path matches just that file).

    Returns:
        A sorted list of matching path strings; empty if nothing matches.
    """
    return sorted(glob.glob(path))
40
+
41
+
42
+
43
def get_texts(files):
    """Tokenize each file with MeCab and keep the selected noun tokens.

    Every file is read as UTF-8 and parsed with a ``-Ochasen`` tagger.
    The parser output is split into tab-separated lines; a line is kept
    when its fourth column starts with '名詞' and its subcategory is not
    one of the excluded ones. The third column of each kept line is
    collected (presumably the base form in ChaSen layout -- confirm
    against the MeCab output format in use).

    Args:
        files: Iterable of file paths to read.

    Returns:
        A list with one entry per input file; each entry is the list of
        selected token strings for that file.
    """
    # Noun subcategories that are filtered out.
    excluded = {'代名詞', '非自立', '固有名詞', '数'}
    mecab = MeCab.Tagger("-Ochasen")
    texts = []
    for path in files:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        selected = []
        for line in mecab.parse(text).splitlines():
            cols = line.split('\t')
            if len(cols) < 4:
                # Lines without a fourth column carry no tag to inspect.
                continue
            parts = cols[3].split('-')
            if not parts[0].startswith('名詞'):
                continue
            # The original indexed parts[1] unconditionally, which raises
            # IndexError for a tag that has no '-subcategory' suffix.
            if len(parts) > 1 and parts[1] in excluded:
                continue
            selected.append(cols[2])
        texts.append(selected)
    return texts
78
+
79
+
80
+
81
def get_dictionary(texts):
    """Build a ``corpora.Dictionary`` from the tokenized documents *texts*."""
    return corpora.Dictionary(texts)
86
+
87
+
88
+
89
def get_corpus(texts, dictionary):
    """Convert every tokenized document with ``dictionary.doc2bow``.

    Args:
        texts: Iterable of token lists, one per document.
        dictionary: Object providing a ``doc2bow(tokens)`` method.

    Returns:
        A list holding the ``doc2bow`` result for each document, in order.
    """
    bows = []
    for tokens in texts:
        bows.append(dictionary.doc2bow(tokens))
    return bows
94
+
95
+
96
+
97
def get_model(corpus, dictionary, num_topics=10):
    """Train an LDA topic model on *corpus*.

    Args:
        corpus: Bag-of-words documents, as produced by ``get_corpus``.
        dictionary: Id-to-word mapping passed to the model as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The trained ``models.LdaModel`` instance.
    """
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # The original returned the undefined name `model`, raising NameError.
    return lda
102
+
103
+
104
+
105
def get_feature_vector(path, dictionary, model):
    """Return the topic-probability vector of the document(s) at *path*.

    The files matching *path* are tokenized, converted to bag-of-words with
    *dictionary*, and transformed with *model*. Each document contributes
    ``model.num_topics`` values, indexed by topic id; when several files
    match, their vectors are concatenated in order.

    The model output is a sparse list of ``(topic_id, probability)`` pairs
    that may omit low-probability topics. Keeping only the probabilities,
    as the original did, produced vectors of varying length whose positions
    did not correspond to the same topics, so they could not be compared
    element-wise. Missing topics are therefore filled with 0.0.

    Args:
        path: File path or glob pattern of the document(s).
        dictionary: Dictionary used to build the bag-of-words.
        model: Trained topic model; assumed to expose ``num_topics`` and to
            yield ``(topic_id, probability)`` pairs per document, as the
            model returned by ``get_model`` does.

    Returns:
        A flat list of floats of length ``num_topics * number_of_documents``.
    """
    files = get_files(path)
    texts = get_texts(files)
    corpus = get_corpus(texts, dictionary)

    num_topics = model.num_topics
    vector = []
    for doc_topics in model[corpus]:
        dense = [0.0] * num_topics
        for topic_id, prob in doc_topics:
            dense[topic_id] = prob
        vector.extend(dense)
    return vector
114
+
115
+
116
+
117
def metric_inverse_norm(a, b):
    """Return the inverse Euclidean distance between vectors *a* and *b*.

    The distance is clamped to at least 1e-10 so that identical vectors
    yield a large finite score instead of a division by zero.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as *a*.

    Returns:
        ``1 / max(||a - b||, 1e-10)`` as a float.
    """
    distance = np.linalg.norm(np.array(a) - np.array(b))
    # The original called np.max(distance, 1E-10), which passes 1E-10 as the
    # `axis` argument and raises TypeError; a scalar maximum is what is meant.
    return float(1.0 / max(distance, 1e-10))
120
+
121
+
122
+
123
def metric_projection(a, b):
    """Return the scalar projection of *a* onto *b*.

    Computed as ``dot(a, b) / ||b||``.
    """
    vec_a = np.array(a)
    vec_b = np.array(b)
    length_b = np.linalg.norm(vec_b)
    return np.dot(vec_a, vec_b) / length_b
128
+
129
+
130
+
131
def normalize_score(d):
    """Scale the values of *d* so that they sum to one.

    Args:
        d: Mapping from key to numeric score.

    Returns:
        A new dict with the same keys, each value divided by the total of
        all values.
    """
    total = sum(d.values())
    normalized = {}
    for key, value in d.items():
        normalized[key] = value / total
    return normalized
136
+
137
+
138
+
139
if __name__ == '__main__':
    # Fit the dictionary and the LDA model on every training document.
    files = get_files('train_set/*.txt')
    texts = get_texts(files)
    dictionary = get_dictionary(texts)
    corpus = get_corpus(texts, dictionary)
    model = get_model(corpus, dictionary, num_topics=10)

    # Known genre of each training file, keyed by the path exactly as
    # returned by get_files. The original looked the label up through an
    # undefined name (`genre_for_f`); fill this mapping in before running.
    genre_by_file = {}  # e.g. {'train_set/doc1.txt': 'A', 'train_set/doc2.txt': 'B'}

    genres = {'A': [], 'B': []}  # supply the known genres
    for f in files:
        genre_for_f = genre_by_file.get(f)
        if genre_for_f is None:
            # Without a label the file cannot serve as a genre reference.
            continue
        v = get_feature_vector(f, dictionary, model)
        genres.setdefault(genre_for_f, []).append(v)

    if not any(genres.values()):
        # With no reference vectors every score is 0 and normalization
        # would divide by zero, so stop with an explicit message instead.
        raise SystemExit('no labelled training files: fill in genre_by_file')

    # Score the test document against the reference vectors of each genre.
    ts_v = get_feature_vector('test_set/test.txt', dictionary, model)
    scores = {}
    metric = metric_inverse_norm  # pick one metric
    for k, v in genres.items():
        scores[k] = sum(metric(ts_v, vv) for vv in v)

    # The original called `normalize`, which is not defined in this file;
    # the helper defined here is `normalize_score`.
    scores = normalize_score(scores)
    print(scores)
176
+
177
+ ```