回答編集履歴

edit

2017/12/05 04:40

投稿

mkgrei

スコア8562

answer CHANGED Viewed

@@ -3,4 +3,87 @@
 gensim.models.ldamodel.LdaModel
 の違いについての質問でしょうか？
-であれば、gensim.model.ldamodal.Ldamodelはgensim/model/ldamodal.pyにあるLdamodelクラスで、gensim.model.Ldamodelはアクセスしやすくしているだけかと。
+であれば、gensim.models.ldamodel.LdaModelはgensim/models/ldamodel.pyにあるLdaModelクラスで、gensim.models.LdaModelはアクセスしやすくしているだけかと。
+---
+```python
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+import time
+import glob
+import MeCab
+from gensim import corpora, models
+import numpy as np
+def get_files(path):
+    """Return the paths that match the glob pattern ``path``.
+
+    The result is exactly what ``glob.glob`` returns for the pattern.
+    """
+    matched = glob.glob(path)
+    return matched
+def get_texts(files):
+    mecab = MeCab.Tagger ("-Ochasen")
+    texts = []
+    for file in files:
+        with open(file, "r",encoding = "utf-8") as f:
+            text = f.read()
+        chunks = mecab.parse(text).splitlines()
+        sels = []
+        for chunk in chunks:
+            cols = chunk.split('\t')
+            if len(cols) >= 4:
+                parts = cols[3].split('-')
+                if parts[0].startswith('名詞'):
+                    if parts[1] in ['代名詞','非自立','固有名詞','数']:
+                        continue
+                    sels.append(cols[2])
+        texts.append(sels)
+    return texts
+def get_dictionary(texts):
+    dictionary = corpora.Dictionary(texts)
+    return dictionary
+def get_corpus(texts, dictionary):
+    """Convert every token list in ``texts`` with ``dictionary.doc2bow``.
+
+    Returns a list holding one converted document per entry of ``texts``,
+    in the same order.
+    """
+    bows = []
+    for tokens in texts:
+        bows.append(dictionary.doc2bow(tokens))
+    return bows
+def get_model(corpus, dictionary, num_topics=10):
+    """Train an LDA model on ``corpus`` and return it.
+
+    Args:
+        corpus: Documents as produced by ``get_corpus``.
+        dictionary: Passed to ``models.LdaModel`` as ``id2word``.
+        num_topics: Number of topics to fit (default 10).
+
+    Returns:
+        The trained ``models.LdaModel`` instance.
+    """
+    lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
+    # Return the object that was just built; returning the unrelated name
+    # ``model`` raises NameError because nothing in this scope defines it.
+    return lda
+def get_feature_vector(path, dictionary, model):
+    """Return the topic-probability feature vector for the file(s) at ``path``.
+
+    ``path`` is expanded with ``get_files``, tokenized with ``get_texts`` and
+    converted with ``get_corpus``. For every resulting document a dense
+    vector with ``model.num_topics`` entries is built, and the vectors of all
+    documents are concatenated into one flat list.
+
+    Args:
+        path: Glob pattern (or plain path) of the text file(s) to embed.
+        dictionary: Dictionary used to convert tokens to bag-of-words.
+        model: Trained topic model, indexable by a corpus and exposing
+            ``num_topics``.
+
+    Returns:
+        A flat list of floats whose length is
+        ``model.num_topics * number_of_matched_files``.
+    """
+    f = get_files(path)
+    t = get_texts(f)
+    c = get_corpus(t, dictionary)
+    features = []
+    for doc_topics in model[c]:
+        # The model may leave out low-probability topics, so place each
+        # probability at its topic id and zero-fill the rest. This keeps
+        # every vector the same length and aligned by topic, which the
+        # distance metrics rely on.
+        dense = [0.0] * model.num_topics
+        for topic_id, prob in doc_topics:
+            dense[topic_id] = prob
+        features.extend(dense)
+    return features
+def metric_inverse_norm(a, b):
+    """Return the inverse of the Euclidean distance between ``a`` and ``b``.
+
+    The distance is floored at 1e-10 so that identical vectors yield a large
+    finite score instead of a division by zero.
+
+    Args:
+        a: First vector (sequence of numbers).
+        b: Second vector, same length as ``a``.
+
+    Returns:
+        ``1 / max(||a - b||, 1e-10)`` as a float.
+    """
+    distance = np.linalg.norm(np.array(a) - np.array(b))
+    # Use the builtin max to compare two scalars; np.max would interpret the
+    # second positional argument as ``axis`` rather than as a lower bound.
+    return 1. / max(distance, 1E-10)
+def metric_projection(a, b):
+    """Return ``dot(a, b) / norm(b)``, the scalar projection of ``a`` onto ``b``."""
+    vec_a = np.array(a)
+    vec_b = np.array(b)
+    b_length = np.linalg.norm(vec_b)
+    return np.dot(vec_a, vec_b) / b_length
+def normalize_score(d):
+    """Return a new dict with each value of ``d`` divided by the sum of all values."""
+    total = sum(d.values())
+    normalized = {}
+    for key, value in d.items():
+        normalized[key] = value / total
+    return normalized
+if __name__ == '__main__':
+    # Fit the dictionary, corpus and topic model on the training files.
+    files = get_files('train_set/*.txt')
+    texts = get_texts(files)
+    dictionary = get_dictionary(texts)
+    corpus = get_corpus(texts, dictionary)
+    model = get_model(corpus, dictionary, num_topics=10)
+    genres = {'A': [], 'B': []}  # supply the known genres here
+    for f in files:
+        v = get_feature_vector(f, dictionary, model)
+        # NOTE(review): ``genre_for_f`` is not defined anywhere in this
+        # snippet. It stands for the known genre key of file ``f`` and must
+        # be provided before this script can run.
+        genres[genre_for_f].append(v)
+    ts_v = get_feature_vector('test_set/test.txt', dictionary, model)
+    metric = metric_inverse_norm  # choose one of the metrics
+    # A genre's score is the sum of the metric between the test vector and
+    # every training vector collected for that genre.
+    scores = {k: sum(metric(ts_v, vv) for vv in v) for k, v in genres.items()}
+    # The normalizing helper defined in this script is ``normalize_score``.
+    scores = normalize_score(scores)
+    print(scores)
+```