Question edit history

Revision 1
Made the purpose of the overall program explicit.

test  CHANGED
File without changes

test  CHANGED
@@ -1,8 +1,12 @@
 ### Background
 I am having trouble with a program, specifically a function that performs preprocessing for the PLSA topic model.

+The point I want to ask about is the function called preprocessing.
+
 This function uses jieba, which performs morphological analysis for Chinese, but I want to perform morphological analysis for Japanese, so I am planning to use mecab. When I rewrite it with mecab, no error occurs, but I do not get the expected result.
-Also, I do not understand what the variable segList
+Also, I am stuck because I do not understand what the variable segList stores.
+
+What this program as a whole is meant to do is cluster Japanese documents by topic.

 Could you please advise me?

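For reference, the segList confusion comes down to the return types of the two tokenizers: jieba.cut returns a generator of word tokens, while MeCab's -Owakati parse returns a single whitespace-joined string, so looping over it yields single characters rather than words. A minimal sketch, assuming jieba and a mecab-python binding are installed (the sample sentences are made up):

```python
import jieba
import MeCab

# jieba.cut returns a generator of word tokens; this is what segList holds.
segList = jieba.cut("我爱北京天安门")
print(list(segList))   # e.g. ['我', '爱', '北京', '天安门']

# MeCab's "-Owakati" output is ONE space-separated string, so a for-loop
# over it yields single characters, not words.
mecab = MeCab.Tagger("-Owakati")
wakati = mecab.parse("私は東京大学に行った")
print(wakati)          # e.g. '私 は 東京 大学 に 行っ た \n'
print(wakati.split())  # splitting recovers a list of word tokens
```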
@@ -10,17 +14,17 @@

 ### What I want to achieve

-
+
 - [ ] Modify the program so that it can perform morphological analysis on Japanese documents.

 ### Problems / error messages
-
-```
-Error
+No error occurs, but I do not get the expected result.
-
+The expected result is that the Japanese documents are clustered by topic.
+
+

 ### Relevant source code
-
+I will show only the function in question from the overall program.
 ```python
 def preprocessing(datasetFilePath, stopwordsFilePath):

@@ -42,11 +46,17 @@
     id2word = {}
     currentId = 0;
     # generate the word2id and id2word maps and count the number of times of words showing up in documents
-    # Generate the word2id and id2word maps and count how many times each word appears in the doc
+    # Generate the word2id and id2word maps and count how many times each word appears in the documents.
     for document in documents:
-        segList = jieba.cut(document)  # jieba: morphological analysis for Chinese
-

+
+        mecab = MeCab.Tagger("-Owakati")
+        mecab.parse("")
+        segList = mecab.parse(str(documents))
+        print(segList)
+
+
+
         wordCount = {}
         for word in segList:
             word = word.lower().strip()  # lower: convert uppercase letters to lowercase
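Two details stand out in the replacement above: a new Tagger is built on every iteration, and mecab.parse(str(documents)) stringifies the whole document list instead of the current document. A per-document sketch of what was presumably intended follows; the sample corpus and the .split() call are assumptions, not the questioner's code:

```python
import MeCab

documents = ["私は東京大学に行った", "犬も歩けば棒に当たる"]  # hypothetical sample

mecab = MeCab.Tagger("-Owakati")  # build the tagger once, outside the loop
mecab.parse("")  # known workaround for a GC issue in older mecab-python bindings

for document in documents:
    # tokenize ONE document at a time and split the wakati string into words
    segList = mecab.parse(document).split()
    print(segList)
```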
@@ -75,12 +85,30 @@
     return N, M, word2id, id2word, X
 ```

+
+
+
 ### What I tried

-
+
-I changed the program as shown below.
 ### Supplementary information (FW / tool versions, etc.)
+Below is the overall program before modification.
 ```
+from numpy import zeros, int8, log
+from pylab import random
+import sys
+import jieba
+import re
+import time
+import codecs
+
+# segmentation, stopwords filtering and document-word matrix generating
+# [return]:
+# N : number of documents
+# M : length of dictionary
+# word2id : a map mapping terms to their corresponding ids
+# id2word : a map mapping ids to terms
+# X : document-word matrix, N*M, each line is the number of terms that show up in the document
 def preprocessing(datasetFilePath, stopwordsFilePath):

     # read the stopwords file
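The comment block above describes X as an N*M document-word count matrix. A toy illustration of that structure, with a hypothetical two-document corpus over a three-word vocabulary:

```python
from numpy import zeros, int8

# hypothetical 2-document corpus over a 3-word vocabulary
word2id = {'dog': 0, 'cat': 1, 'bird': 2}
docs = [['dog', 'cat', 'dog'], ['bird']]

X = zeros([len(docs), len(word2id)], int8)
for i, doc in enumerate(docs):
    for w in doc:
        X[i, word2id[w]] += 1   # count occurrences of word w in document i

print(X)  # [[2 1 0]
          #  [0 0 1]]
```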
@@ -89,8 +117,8 @@
     file.close()

     # read the documents
-    file = codecs.open(datasetFilePath, 'r', 'utf-8')
+    file = codecs.open(datasetFilePath, 'r', 'utf-8')
-    documents = [document.strip() for document in file]
+    documents = [document.strip() for document in file]
     file.close()

     # number of documents
@@ -101,20 +129,11 @@
     id2word = {}
     currentId = 0;
     # generate the word2id and id2word maps and count the number of times of words showing up in documents
-    # Generate the word2id and id2word maps and count how many times each word appears in the documents.
     for document in documents:
-
-
-        mecab = MeCab.Tagger("-Owakati")
-        mecab.parse("")
-        segList =
+        segList = jieba.cut(document)
-        print(segList)
-
-
-
         wordCount = {}
         for word in segList:
-            word = word.lower().strip()
+            word = word.lower().strip()
             if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords:
                 if word not in word2id.keys():
                     word2id[word] = currentId;
@@ -129,8 +148,8 @@
     # length of dictionary
     M = len(word2id)

-    # generate the document-word matrix
+    # generate the document-word matrix
-    X = zeros([N, M], int8)
+    X = zeros([N, M], int8)
     for word in word2id.keys():
         j = word2id[word]
         for i in range(0, N):
@@ -138,6 +157,161 @@
                 X[i, j] = wordCounts[i][word];

     return N, M, word2id, id2word, X
+
+def initializeParameters():
+    for i in range(0, N):
+        normalization = sum(lamda[i, :])
+        for j in range(0, K):
+            lamda[i, j] /= normalization;
+
+    for i in range(0, K):
+        normalization = sum(theta[i, :])
+        for j in range(0, M):
+            theta[i, j] /= normalization;
+
+def EStep():
+    for i in range(0, N):
+        for j in range(0, M):
+            denominator = 0;
+            for k in range(0, K):
+                p[i, j, k] = theta[k, j] * lamda[i, k];
+                denominator += p[i, j, k];
+            if denominator == 0:
+                for k in range(0, K):
+                    p[i, j, k] = 0;
+            else:
+                for k in range(0, K):
+                    p[i, j, k] /= denominator;
+
+def MStep():
+    # update theta
+    for k in range(0, K):
+        denominator = 0
+        for j in range(0, M):
+            theta[k, j] = 0
+            for i in range(0, N):
+                theta[k, j] += X[i, j] * p[i, j, k]
+            denominator += theta[k, j]
+        if denominator == 0:
+            for j in range(0, M):
+                theta[k, j] = 1.0 / M
+        else:
+            for j in range(0, M):
+                theta[k, j] /= denominator
+
+    # update lamda
+    for i in range(0, N):
+        for k in range(0, K):
+            lamda[i, k] = 0
+            denominator = 0
+            for j in range(0, M):
+                lamda[i, k] += X[i, j] * p[i, j, k]
+                denominator += X[i, j];
+            if denominator == 0:
+                lamda[i, k] = 1.0 / K
+            else:
+                lamda[i, k] /= denominator
+
+# calculate the log likelihood
+def LogLikelihood():
+    loglikelihood = 0
+    for i in range(0, N):
+        for j in range(0, M):
+            tmp = 0
+            for k in range(0, K):
+                tmp += theta[k, j] * lamda[i, k]
+            if tmp > 0:
+                loglikelihood += X[i, j] * log(tmp)
+    return loglikelihood
+
+# output the params of model and top words of topics to files
+def output():
+    # document-topic distribution
+    file = codecs.open(docTopicDist, 'w', 'utf-8')
+    for i in range(0, N):
+        tmp = ''
+        for j in range(0, K):
+            tmp += str(lamda[i, j]) + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+    # topic-word distribution
+    file = codecs.open(topicWordDist, 'w', 'utf-8')
+    for i in range(0, K):
+        tmp = ''
+        for j in range(0, M):
+            tmp += str(theta[i, j]) + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+    # dictionary
+    file = codecs.open(dictionary, 'w', 'utf-8')
+    for i in range(0, M):
+        file.write(id2word[i] + '\n')
+    file.close()
+
+    # top words of each topic
+    file = codecs.open(topicWords, 'w', 'utf-8')
+    for i in range(0, K):
+        topicword = []
+        ids = theta[i, :].argsort()
+        for j in ids:
+            topicword.insert(0, id2word[j])
+        tmp = ''
+        for word in topicword[0:min(topicWordsNum, len(topicword))]:
+            tmp += word + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+# set the default params and read the params from cmd
+datasetFilePath = 'dataset.txt'
+stopwordsFilePath = 'stopwords.dic'
+K = 10    # number of topics
+maxIteration = 30
+threshold = 10.0
+topicWordsNum = 10
+docTopicDist = 'docTopicDistribution.txt'
+topicWordDist = 'topicWordDistribution.txt'
+dictionary = 'dictionary.dic'
+topicWords = 'topics.txt'
+if(len(sys.argv) == 11):
+    datasetFilePath = sys.argv[1]
+    stopwordsFilePath = sys.argv[2]
+    K = int(sys.argv[3])
+    maxIteration = int(sys.argv[4])
+    threshold = float(sys.argv[5])
+    topicWordsNum = int(sys.argv[6])
+    docTopicDist = sys.argv[7]
+    topicWordDist = sys.argv[8]
+    dictionary = sys.argv[9]
+    topicWords = sys.argv[10]
+
+# preprocessing
+N, M, word2id, id2word, X = preprocessing(datasetFilePath, stopwordsFilePath)
+
+# lamda[i, j] : p(zj|di)
+lamda = random([N, K])
+
+# theta[i, j] : p(wj|zi)
+theta = random([K, M])
+
+# p[i, j, k] : p(zk|di,wj)
+p = zeros([N, M, K])
+
+initializeParameters()
+
+# EM algorithm
+oldLoglikelihood = 1
+newLoglikelihood = 1
+for i in range(0, maxIteration):
+    EStep()
+    MStep()
+    newLoglikelihood = LogLikelihood()
+    print("[", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), "] ", i+1, " iteration ", str(newLoglikelihood))
+    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
+        break
+    oldLoglikelihood = newLoglikelihood
+
+output()
+
 ```
-
-
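Since the stated goal is clustering Japanese documents by topic, the relevant output of the full program above is lamda[i, k], an estimate of p(zk|di), which output() writes row by row to docTopicDistribution.txt. Below is a small sketch of turning those rows into hard cluster assignments; this helper is not part of the question, and the file name is the script's default:

```python
# Assign each document to its most probable topic, reading the
# docTopicDistribution.txt that output() writes (one row of K
# probabilities per document).
with open('docTopicDistribution.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        probs = [float(x) for x in line.split()]
        topic = probs.index(max(probs))  # argmax over the K topics
        print(f"document {i} -> topic {topic}")
```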