前提
トピックモデルのPLSAの前処理を行う関数のプログラムで困っています.
質問したい点はpreprocessingという関数についてです.
この関数では,中国語の形態素解析を行うjiebaが用いられていますが,日本語の形態素解析を行いたいのでmecabを使用しようと考えています.ただ,これをmecabを用いて書くとエラーは出ないのですが,期待する実行結果が得られません.
また,変数segListも何を格納しているのかが理解できず,困っています.
また,この全体のプログラムで行いたいことは,日本語の文書をトピックごとにクラスタリングすることが目的です.
どうか,教えていただけませんでしょうか.
実現したいこと
- 日本語の文書を形態素解析できるようにプログラムを改造する.
発生している問題・エラーメッセージ
エラーは発生していませんが,期待する結果が得られません.
期待する結果とは,日本語文書をトピックごとにクラスタリングすることです.
該当のソースコード
全体のプログラムの中の質問個所の関数だけ示します.
python
1def preprocessing(datasetFilePath, stopwordsFilePath): 2 3 # read the stopwords file 4 file = codecs.open(stopwordsFilePath, 'r', 'utf-8') 5 stopwords = [line.strip() for line in file] 6 file.close() 7 8 # read the documents 9 file = codecs.open(datasetFilePath, 'r', 'utf-8') #ファイルを開いてファイルオブジェクトを取得(codecs.open()) 10 documents = [document.strip() for document in file] #strip()で空白文字を削除 11 file.close() 12 13 # number of documents 14 N = len(documents) 15 16 wordCounts = []; 17 word2id = {} 18 id2word = {} 19 currentId = 0; 20 # generate the word2id and id2word maps and count the number of times of words showing up in documents 21 # word2idとid2wordマップを生み出し,文書中に出てくる単語の回数を数える. 22 for document in documents: 23 24 25 mecab = MeCab.Tagger("-Owakati") 26 mecab.parse("") 27 segList = mecab.parse(str(documents)) 28 print(segList) 29 30 31 32 wordCount = {} 33 for word in segList: 34 word = word.lower().strip() #lower:大文字と小文字を変換 35 if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords: 36 if word not in word2id.keys(): 37 word2id[word] = currentId; 38 id2word[currentId] = word; 39 currentId += 1; 40 if word in wordCount: 41 wordCount[word] += 1 42 else: 43 wordCount[word] = 1 44 wordCounts.append(wordCount); 45 46 # length of dictionary 47 M = len(word2id) 48 49 # generate the document-word matrix 行列 50 X = zeros([N, M], int8) #8バイト整数 N行M列 51 for word in word2id.keys(): 52 j = word2id[word] 53 for i in range(0, N): 54 if word in wordCounts[i]: 55 X[i, j] = wordCounts[i][word]; 56 57 return N, M, word2id, id2word, X
試したこと
補足情報(FW/ツールのバージョンなど)
改造する前の全体のプログラムを示します.
from numpy import zeros, int8, log from pylab import random import sys import jieba import re import time import codecs # segmentation, stopwords filtering and document-word matrix generating # [return]: # N : number of documents # M : length of dictionary # word2id : a map mapping terms to their corresponding ids # id2word : a map mapping ids to terms # X : document-word matrix, N*M, each line is the number of terms that show up in the document def preprocessing(datasetFilePath, stopwordsFilePath): # read the stopwords file file = codecs.open(stopwordsFilePath, 'r', 'utf-8') stopwords = [line.strip() for line in file] file.close() # read the documents file = codecs.open(datasetFilePath, 'r', 'utf-8') documents = [document.strip() for document in file] file.close() # number of documents N = len(documents) wordCounts = []; word2id = {} id2word = {} currentId = 0; # generate the word2id and id2word maps and count the number of times of words showing up in documents for document in documents: segList = jieba.cut(document) wordCount = {} for word in segList: word = word.lower().strip() if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords: if word not in word2id.keys(): word2id[word] = currentId; id2word[currentId] = word; currentId += 1; if word in wordCount: wordCount[word] += 1 else: wordCount[word] = 1 wordCounts.append(wordCount); # length of dictionary M = len(word2id) # generate the document-word matrix X = zeros([N, M], int8) for word in word2id.keys(): j = word2id[word] for i in range(0, N): if word in wordCounts[i]: X[i, j] = wordCounts[i][word]; return N, M, word2id, id2word, X def initializeParameters(): for i in range(0, N): normalization = sum(lamda[i, :]) for j in range(0, K): lamda[i, j] /= normalization; for i in range(0, K): normalization = sum(theta[i, :]) for j in range(0, M): theta[i, j] /= normalization; def EStep(): for i in range(0, N): for j in range(0, M): denominator = 0; for k in range(0, K): p[i, j, k] = theta[k, j] * lamda[i, k]; denominator += p[i, j, k]; if denominator == 0: for k in range(0, K): p[i, j, k] = 0; else: for k in range(0, K): p[i, j, k] /= denominator; def MStep(): # update theta for k in range(0, K): denominator = 0 for j in range(0, M): theta[k, j] = 0 for i in range(0, N): theta[k, j] += X[i, j] * p[i, j, k] denominator += theta[k, j] if denominator == 0: for j in range(0, M): theta[k, j] = 1.0 / M else: for j in range(0, M): theta[k, j] /= denominator # update lamda for i in range(0, N): for k in range(0, K): lamda[i, k] = 0 denominator = 0 for j in range(0, M): lamda[i, k] += X[i, j] * p[i, j, k] denominator += X[i, j]; if denominator == 0: lamda[i, k] = 1.0 / K else: lamda[i, k] /= denominator # calculate the log likelihood def LogLikelihood(): loglikelihood = 0 for i in range(0, N): for j in range(0, M): tmp = 0 for k in range(0, K): tmp += theta[k, j] * lamda[i, k] if tmp > 0: loglikelihood += X[i, j] * log(tmp) return loglikelihood # output the params of model and top words of topics to files def output(): # document-topic distribution file = codecs.open(docTopicDist,'w','utf-8') for i in range(0, N): tmp = '' for j in range(0, K): tmp += str(lamda[i, j]) + ' ' file.write(tmp + '\n') file.close() # topic-word distribution file = codecs.open(topicWordDist,'w','utf-8') for i in range(0, K): tmp = '' for j in range(0, M): tmp += str(theta[i, j]) + ' ' file.write(tmp + '\n') file.close() # dictionary file = codecs.open(dictionary,'w','utf-8') for i in range(0, M): file.write(id2word[i] + '\n') file.close() # top words of each topic file = codecs.open(topicWords,'w','utf-8') for i in range(0, K): topicword = [] ids = theta[i, :].argsort() for j in ids: topicword.insert(0, id2word[j]) tmp = '' for word in topicword[0:min(topicWordsNum, len(topicword))]: tmp += word + ' ' file.write(tmp + '\n') file.close() # set the default params and read the params from cmd datasetFilePath = 'dataset.txt' stopwordsFilePath = 'stopwords.dic' K = 10 # number of topic maxIteration = 30 threshold = 10.0 topicWordsNum = 10 docTopicDist = 'docTopicDistribution.txt' topicWordDist = 'topicWordDistribution.txt' dictionary = 'dictionary.dic' topicWords = 'topics.txt' if(len(sys.argv) == 11): datasetFilePath = sys.argv[1] stopwordsFilePath = sys.argv[2] K = int(sys.argv[3]) maxIteration = int(sys.argv[4]) threshold = float(sys.argv[5]) topicWordsNum = int(sys.argv[6]) docTopicDist = sys.argv[7] topicWordDist = sys.argv[8] dictionary = sys.argv[9] topicWords = sys.argv[10] # preprocessing N, M, word2id, id2word, X = preprocessing(datasetFilePath, stopwordsFilePath) # lamda[i, j] : p(zj|di) lamda = random([N, K]) # theta[i, j] : p(wj|zi) theta = random([K, M]) # p[i, j, k] : p(zk|di,wj) p = zeros([N, M, K]) initializeParameters() # EM algorithm oldLoglikelihood = 1 newLoglikelihood = 1 for i in range(0, maxIteration): EStep() MStep() newLoglikelihood = LogLikelihood() print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration ", str(newLoglikelihood)) if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold): break oldLoglikelihood = newLoglikelihood output()

回答1件
あなたの回答
tips
プレビュー

