回答編集履歴
1
edit
answer
CHANGED
@@ -3,4 +3,87 @@
|
|
3
3
|
gensim.models.ldamodel.LdaModel
|
4
4
|
の違いについての質問でしょうか?
|
5
5
|
|
6
|
-
であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyにあるLdaModelクラスで、gensim.models.LdaModelはそれをアクセスしやすくしているだけかと。
|
6
|
+
であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyにあるLdaModelクラスで、gensim.models.LdaModelはそれをアクセスしやすくしているだけかと。
|
7
|
+
|
8
|
+
---
|
9
|
+
```python
|
10
|
+
#!/usr/bin/env python3
|
11
|
+
# -*- coding: utf-8 -*-
|
12
|
+
|
13
|
+
import time
|
14
|
+
import glob
|
15
|
+
import MeCab
|
16
|
+
from gensim import corpora, models
|
17
|
+
import numpy as np
|
18
|
+
|
19
|
+
def get_files(path):
    """Return the paths that match the glob pattern *path*.

    Args:
        path: A glob pattern such as ``'train_set/*.txt'``, or a plain
            file path.

    Returns:
        A list of matching path strings, in whatever order ``glob.glob``
        produces them; empty when nothing matches.
    """
    return glob.glob(path)
|
21
|
+
|
22
|
+
def get_texts(files):
    """Tokenize each file with MeCab and collect the selected nouns.

    Each file is read as UTF-8 and parsed with a MeCab tagger created with
    the ``-Ochasen`` option. Every tab-separated output line with at least
    four columns is inspected: column 3 is split on ``'-'`` and, when its
    first part starts with '名詞' (noun) and its second part is not one of
    the excluded subcategories, column 2 is kept.

    Args:
        files: Iterable of file paths to read.

    Returns:
        A list with one entry per input file; each entry is the list of
        kept tokens for that file, in order of appearance.
    """
    # Noun subcategories to drop: pronoun, non-independent, proper noun, number.
    excluded = {'代名詞', '非自立', '固有名詞', '数'}
    mecab = MeCab.Tagger("-Ochasen")
    texts = []
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            text = f.read()
        sels = []
        for chunk in mecab.parse(text).splitlines():
            cols = chunk.split('\t')
            # Lines such as the trailing "EOS" marker have too few columns.
            if len(cols) < 4:
                continue
            # NOTE(review): assumes ChaSen-style columns where cols[3] is the
            # part of speech and cols[2] the base form -- confirm against the
            # installed MeCab dictionary.
            parts = cols[3].split('-')
            if not parts[0].startswith('名詞'):
                continue
            # Guard the subcategory lookup: a part of speech without a
            # '-'-separated subcategory would otherwise raise IndexError.
            if len(parts) > 1 and parts[1] in excluded:
                continue
            sels.append(cols[2])
        texts.append(sels)
    return texts
|
40
|
+
|
41
|
+
def get_dictionary(texts):
    """Build a gensim ``corpora.Dictionary`` from tokenized documents.

    Args:
        texts: A list of documents, each a list of token strings.

    Returns:
        The ``corpora.Dictionary`` constructed from *texts*.
    """
    return corpora.Dictionary(texts)
|
44
|
+
|
45
|
+
def get_corpus(texts, dictionary):
    """Convert tokenized documents into bag-of-words form.

    Args:
        texts: A list of documents, each a list of token strings.
        dictionary: An object exposing ``doc2bow(tokens)``.

    Returns:
        A list holding ``dictionary.doc2bow(text)`` for every document in
        *texts*, in the same order.
    """
    return [dictionary.doc2bow(text) for text in texts]
|
48
|
+
|
49
|
+
def get_model(corpus, dictionary, num_topics=10):
    """Create a gensim ``LdaModel`` for the given corpus.

    Args:
        corpus: Bag-of-words corpus passed as ``corpus=``.
        dictionary: Id-to-word mapping passed as ``id2word=``.
        num_topics: Number of topics requested from the model.

    Returns:
        The constructed ``models.LdaModel`` instance.
    """
    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
    # Return the object that was just built; the previous code returned the
    # undefined name ``model`` and raised NameError.
    return lda
|
52
|
+
|
53
|
+
def get_feature_vector(path, dictionary, model):
    """Return the topic-probability vector for the file(s) matching *path*.

    The files are tokenized, converted to bag-of-words with *dictionary*
    and passed through *model*. For every document a dense list of
    ``model.num_topics`` probabilities is produced (topics the model does
    not report are 0.0); the per-document lists are concatenated.

    Args:
        path: Glob pattern or file path handed to ``get_files``.
        dictionary: Dictionary used to build the bag-of-words corpus.
        model: Topic model; indexing it with a corpus must yield, per
            document, an iterable of ``(topic_id, probability)`` pairs,
            and it must expose ``num_topics``.

    Returns:
        A flat list of floats of length ``num_topics * number_of_documents``.
    """
    files = get_files(path)
    texts = get_texts(files)
    corpus = get_corpus(texts, dictionary)
    vector = []
    for doc_topics in model[corpus]:
        # The model returns a sparse list that omits low-probability topics,
        # so place each probability at its topic index. This keeps every
        # vector the same length, which the metric functions rely on.
        dense = [0.0] * model.num_topics
        for topic_id, prob in doc_topics:
            dense[topic_id] = prob
        vector.extend(dense)
    return vector
|
58
|
+
|
59
|
+
def metric_inverse_norm(a, b):
    """Return the inverse Euclidean distance between vectors *a* and *b*.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as *a*.

    Returns:
        ``1 / max(||a - b||, 1e-10)``; larger values mean closer vectors.
    """
    distance = np.linalg.norm(np.array(a) - np.array(b))
    # Clamp with the builtin max: np.max(x, 1E-10) would treat 1E-10 as the
    # ``axis`` argument and fail. The floor avoids division by zero when the
    # two vectors are identical.
    return 1. / max(distance, 1E-10)
|
61
|
+
|
62
|
+
def metric_projection(a, b):
    """Return the scalar projection of *a* onto *b*.

    Args:
        a: Sequence of numbers.
        b: Sequence of numbers with the same length as *a*.

    Returns:
        ``dot(a, b) / ||b||``. No guard is applied for a zero-length *b*.
    """
    nb = np.array(b)
    return np.dot(np.array(a), nb) / np.linalg.norm(nb)
|
65
|
+
|
66
|
+
def normalize_score(d):
    """Scale the values of *d* so that they sum to 1.

    Args:
        d: Mapping from key to numeric score.

    Returns:
        A new dict with the same keys and each value divided by the sum of
        all values. An empty mapping yields an empty dict.

    Raises:
        ZeroDivisionError: If *d* is non-empty and its plain-number values
            sum to zero.
    """
    total = sum(d.values())
    return {k: v / total for k, v in d.items()}
|
69
|
+
|
70
|
+
if __name__ == '__main__':
    # Build the dictionary, bag-of-words corpus and LDA model from the
    # training files.
    files = get_files('train_set/*.txt')
    texts = get_texts(files)
    dictionary = get_dictionary(texts)
    corpus = get_corpus(texts, dictionary)
    model = get_model(corpus, dictionary, num_topics=10)

    # Collect one feature vector per training file under its genre.
    genres = {'A': [], 'B': []}  # known genres, supplied by the user
    for f in files:
        v = get_feature_vector(f, dictionary, model)
        # NOTE(review): genre_for_f is not defined anywhere in the visible
        # code; it looks like a placeholder for the known genre label
        # ('A' or 'B') of file f and must be supplied before running.
        genres[genre_for_f].append(v)

    # Score the test document against every genre with the chosen metric.
    ts_v = get_feature_vector('test_set/test.txt', dictionary, model)
    scores = {}
    metric = metric_inverse_norm  # pick one of the metrics
    for k, v in genres.items():
        scores[k] = sum([metric(ts_v, vv) for vv in v])
    # The helper defined in this file is normalize_score; the previous call
    # to ``normalize`` referenced an undefined name.
    scores = normalize_score(scores)
    print(scores)
|
89
|
+
```
|