回答編集履歴
1
edit
test
CHANGED
@@ -9,3 +9,169 @@
|
|
9
9
|
|
10
10
|
|
11
11
|
であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyにあるLdaModelクラスで、gensim.models.LdaModelはアクセスしやすくしているだけかと。
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
---
|
16
|
+
|
17
|
+
```python
|
18
|
+
|
19
|
+
#!/usr/bin/env python3
|
20
|
+
|
21
|
+
# -*- coding: utf-8 -*-
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
import time
|
26
|
+
|
27
|
+
import glob
|
28
|
+
|
29
|
+
import MeCab
|
30
|
+
|
31
|
+
from gensim import corpora, models
|
32
|
+
|
33
|
+
import numpy as np
|
34
|
+
|
35
|
+
|
36
|
+
|
37
|
+
def get_files(path):
    """Return the paths that match the glob pattern ``path``.

    Args:
        path: A glob pattern (a plain file path also works) passed straight
            to ``glob.glob``.

    Returns:
        A list of matching path strings; empty when nothing matches.
    """
    return glob.glob(path)
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
def get_texts(files):
    """Tokenize each file with MeCab and keep selected nouns.

    Every file is read as UTF-8 and parsed with a ``-Ochasen`` tagger.
    For each tab-separated output line with at least four columns, the
    fourth column is split on ``-``; when its first part starts with
    '名詞' (noun) and its second part is not one of the excluded
    sub-categories, the third column is collected.
    NOTE(review): presumably column 3 is the base form and column 4 the
    part-of-speech tag in ChaSen output - confirm against the MeCab
    dictionary in use.

    Args:
        files: Iterable of paths to UTF-8 text files.

    Returns:
        A list with one list of selected tokens per input file, in the
        same order as ``files``.
    """
    # Noun sub-categories that are skipped: pronoun, non-independent,
    # proper noun, number.
    excluded = ('代名詞', '非自立', '固有名詞', '数')
    mecab = MeCab.Tagger("-Ochasen")
    texts = []
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        sels = []
        for chunk in mecab.parse(text).splitlines():
            cols = chunk.split('\t')
            if len(cols) < 4:
                # Lines without a part-of-speech column carry no token.
                continue
            parts = cols[3].split('-')
            if not parts[0].startswith('名詞'):
                continue
            # A tag without a sub-category has a single part; guard the
            # index so such a noun is kept instead of raising IndexError.
            if len(parts) > 1 and parts[1] in excluded:
                continue
            sels.append(cols[2])
        texts.append(sels)
    return texts
|
78
|
+
|
79
|
+
|
80
|
+
|
81
|
+
def get_dictionary(texts):
    """Build a gensim ``corpora.Dictionary`` from tokenized documents.

    Args:
        texts: A list of token lists, one per document.

    Returns:
        The ``corpora.Dictionary`` constructed from ``texts``.
    """
    return corpora.Dictionary(texts)
|
86
|
+
|
87
|
+
|
88
|
+
|
89
|
+
def get_corpus(texts, dictionary):
    """Convert tokenized documents with ``dictionary.doc2bow``.

    Args:
        texts: A list of token lists, one per document.
        dictionary: Object providing ``doc2bow(tokens)``.

    Returns:
        A list holding the ``doc2bow`` result for each document, in the
        same order as ``texts``.
    """
    return [dictionary.doc2bow(text) for text in texts]
|
94
|
+
|
95
|
+
|
96
|
+
|
97
|
+
def get_model(corpus, dictionary, num_topics=10):
    """Train an LDA topic model on ``corpus``.

    Args:
        corpus: The documents to train on, as produced by ``get_corpus``.
        dictionary: Passed to ``models.LdaModel`` as ``id2word``.
        num_topics: Number of topics to fit.

    Returns:
        The ``models.LdaModel`` instance that was created.
    """
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # Return the freshly built model; the name ``model`` is not defined
    # inside this function.
    return lda
|
102
|
+
|
103
|
+
|
104
|
+
|
105
|
+
def get_feature_vector(path, dictionary, model):
    """Return the topic weights of the file(s) matching ``path``.

    The files are tokenized, converted with ``dictionary`` and transformed
    with ``model[...]``; the weights of all matching documents are
    concatenated into one flat list.

    Args:
        path: Glob pattern or file path of the document(s) to vectorize.
        dictionary: Dictionary used to convert the tokens.
        model: Trained model supporting ``model[corpus]`` and yielding
            ``(topic_id, weight)`` pairs per document.

    Returns:
        A flat list of weights. When ``model`` exposes ``num_topics``,
        each document contributes exactly ``num_topics`` values (missing
        topics are 0.0), so vectors of single documents are comparable
        element by element.
    """
    f = get_files(path)
    t = get_texts(f)
    c = get_corpus(t, dictionary)

    num_topics = getattr(model, 'num_topics', None)
    features = []
    for doc_topics in model[c]:
        if num_topics is None:
            # Unknown dimensionality: keep the weights exactly as reported.
            features.extend(p[1] for p in doc_topics)
            continue
        # The model may omit low-weight topics, which would give vectors of
        # differing lengths; place each weight in its topic's slot instead.
        dense = [0.0] * num_topics
        for topic_id, weight in doc_topics:
            dense[topic_id] = weight
        features.extend(dense)
    return features
|
114
|
+
|
115
|
+
|
116
|
+
|
117
|
+
def metric_inverse_norm(a, b):
    """Return the inverse of the Euclidean distance between ``a`` and ``b``.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as ``a``.

    Returns:
        ``1 / distance``; the distance is clamped to at least 1E-10 so
        identical vectors yield a large finite score instead of dividing
        by zero.
    """
    distance = np.linalg.norm(np.array(a) - np.array(b))
    # Built-in max compares the two scalars; np.max would treat the second
    # argument as an axis.
    return 1. / max(distance, 1E-10)
|
120
|
+
|
121
|
+
|
122
|
+
|
123
|
+
def metric_projection(a, b):
    """Return the scalar projection of ``a`` onto ``b``.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / norm(b)``, or 0.0 when ``b`` has zero length (the
        projection is undefined there and would otherwise be NaN).
    """
    nb = np.array(b)
    length = np.linalg.norm(nb)
    if length == 0:
        return 0.0
    return np.dot(np.array(a), nb) / length
|
128
|
+
|
129
|
+
|
130
|
+
|
131
|
+
def normalize_score(d):
    """Scale the values of ``d`` so that they sum to 1.

    Args:
        d: Mapping from label to numeric score.

    Returns:
        A new dict with the same keys and each value divided by the sum
        of all values. When the sum is 0 the values cannot be scaled and
        are returned unchanged in a new dict.
    """
    total = sum(d.values())
    if total == 0:
        # Covers the empty mapping and all-zero scores without dividing.
        return dict(d)
    return {k: v / total for k, v in d.items()}
|
136
|
+
|
137
|
+
|
138
|
+
|
139
|
+
if __name__ == '__main__':
    # Fit the topic model on the training documents.
    files = get_files('train_set/*.txt')
    texts = get_texts(files)
    dictionary = get_dictionary(texts)
    corpus = get_corpus(texts, dictionary)
    model = get_model(corpus, dictionary, num_topics=10)

    # Collect one feature vector per training file under its genre.
    genres = {'A': [], 'B': []}  # supply the known genres
    for f in files:
        v = get_feature_vector(f, dictionary, model)
        # NOTE(review): genre_for_f is not defined in the visible code; it
        # looks like a placeholder for the known genre label of file f and
        # must be supplied before this script can run.
        genres[genre_for_f].append(v)

    # Score the test document against every genre.
    ts_v = get_feature_vector('test_set/test.txt', dictionary, model)
    scores = {}
    metric = metric_inverse_norm  # choose one metric
    for k, v in genres.items():
        scores[k] = sum(metric(ts_v, vv) for vv in v)

    # The normalizing helper defined in this file is normalize_score.
    scores = normalize_score(scores)
    print(scores)
|
176
|
+
|
177
|
+
```
|