プログラムの改造，日本語形態素解析

前提

トピックモデルのPLSAの前処理を行う関数のプログラムで困っています．

質問したい点はpreprocessingという関数についてです．

この関数では，中国語の形態素解析を行うjiebaが用いられていますが，日本語の形態素解析を行いたいのでmecabを使用しようと考えています．ただ，これをmecabを用いて書くとエラーは出ないのですが，期待する実行結果が得られません．
また，変数segListも何を格納しているのかが理解できず，困っています．

また，このプログラム全体の目的は，日本語の文書をトピックごとにクラスタリングすることです．

どうか，教えていただけませんでしょうか．

実現したいこと

日本語の文書を形態素解析できるようにプログラムを改造する．

発生している問題・エラーメッセージ

エラーは発生していませんが，期待する結果が得られません．
期待する結果とは，日本語文書をトピックごとにクラスタリングすることです．

該当のソースコード

全体のプログラムの中の質問個所の関数だけ示します．

python
1def preprocessing(datasetFilePath, stopwordsFilePath):
2    # Build the document-word count matrix from a dataset file and a stopword file (MeCab variant of the original).
3    # read the stopwords file
4    file = codecs.open(stopwordsFilePath, 'r', 'utf-8')
5    stopwords = [line.strip() for line in file] 
6    file.close()
7    
8    # read the documents
9    file = codecs.open(datasetFilePath, 'r', 'utf-8')   # open the file and get a file object (codecs.open())
10    documents = [document.strip() for document in file]    # one entry per line; strip() removes surrounding whitespace
11    file.close()
12
13    # number of documents
14    N = len(documents)
15
16    wordCounts = [];
17    word2id = {}
18    id2word = {}
19    currentId = 0;
20    # generate the word2id and id2word maps and count the number of times of words showing up in documents
21    # i.e. give every new word the next id and keep one word->count dict per document in wordCounts
22    for document in documents:
23        
24
25        mecab = MeCab.Tagger("-Owakati")
26        mecab.parse("")
27        segList = mecab.parse(str(documents))  # NOTE(review): this parses str() of the whole `documents` list, not the loop variable `document`
28        print(segList)
29
30    
31
32        wordCount = {}
33        for word in segList:  # NOTE(review): presumably segList is one space-separated str here, so this loop visits single characters — confirm and split it into words first
34            word = word.lower().strip() # lower(): convert uppercase letters to lowercase
35            if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords:  # keep only tokens longer than 1 char, without ASCII digits, not in stopwords
36                if word not in word2id.keys():
37                    word2id[word] = currentId;
38                    id2word[currentId] = word;
39                    currentId += 1;
40                if word in wordCount:
41                    wordCount[word] += 1
42                else:
43                    wordCount[word] = 1
44        wordCounts.append(wordCount);
45    
46    # length of dictionary
47    M = len(word2id)  
48
49    # generate the document-word matrix
50    X = zeros([N, M], int8) # int8 is an 8-bit (not 8-byte) integer; N rows by M columns
51    for word in word2id.keys():
52        j = word2id[word]
53        for i in range(0, N):
54            if word in wordCounts[i]:
55                X[i, j] = wordCounts[i][word];    
56
57    return N, M, word2id, id2word, X

試したこと

補足情報（FW/ツールのバージョンなど）

改造する前の全体のプログラムを示します．

from numpy import zeros, int8, log
from pylab import random
import sys
import jieba
import re
import time
import codecs

# segmentation, stopwords filtering and document-word matrix generating
# [return]:
# N : number of documents
# M : length of dictionary
# word2id : a map mapping terms to their corresponding ids
# id2word : a map mapping ids to terms
# X : document-word matrix, N*M, each line is the number of terms that show up in the document
def preprocessing(datasetFilePath, stopwordsFilePath):
    """Segment the documents, filter tokens and build the document-word matrix.

    Every line of ``datasetFilePath`` is treated as one document and every
    line of ``stopwordsFilePath`` as one stopword; both files are read as
    UTF-8. Each document is segmented with ``jieba.cut``; tokens are
    lower-cased and stripped, and a token is kept only when it is longer than
    one character, contains no ASCII digit and is not a stopword.

    Returns:
        A tuple ``(N, M, word2id, id2word, X)``: the number of documents, the
        vocabulary size, the term -> id map, the id -> term map (ids are
        assigned in order of first appearance), and the ``N x M`` matrix of
        per-document term counts (dtype int32).
    """
    # Context managers close the files even when reading or decoding fails.
    with codecs.open(stopwordsFilePath, 'r', 'utf-8') as file:
        # A set gives O(1) membership tests in the per-token filter below.
        stopwords = {line.strip() for line in file}

    with codecs.open(datasetFilePath, 'r', 'utf-8') as file:
        documents = [document.strip() for document in file]

    # number of documents
    N = len(documents)

    hasDigit = re.compile('[0-9]').search
    wordCounts = []
    word2id = {}
    id2word = {}
    for document in documents:
        wordCount = {}
        for word in jieba.cut(document):
            word = word.lower().strip()
            if len(word) <= 1 or hasDigit(word) or word in stopwords:
                continue
            if word not in word2id:
                # ids are dense and sequential, so the next id is the map size
                currentId = len(word2id)
                word2id[word] = currentId
                id2word[currentId] = word
            wordCount[word] = wordCount.get(word, 0) + 1
        wordCounts.append(wordCount)

    # length of dictionary
    M = len(word2id)

    # int32 rather than int8: a term that occurs more than 127 times in one
    # document would not fit into an 8-bit cell.
    X = zeros([N, M], 'int32')
    # Only the non-zero cells are visited; everything else stays 0.
    for i, wordCount in enumerate(wordCounts):
        for word, count in wordCount.items():
            X[i, word2id[word]] = count

    return N, M, word2id, id2word, X

def initializeParameters():
    """Scale the module-level ``lamda`` and ``theta`` arrays row by row.

    Each of the first N rows of ``lamda`` has its first K entries divided by
    the sum of the whole row; each of the first K rows of ``theta`` has its
    first M entries divided by the sum of the whole row. Both arrays are
    modified in place; nothing is returned.
    """
    for docIndex in range(N):
        rowTotal = sum(lamda[docIndex, :])
        lamda[docIndex, :K] = lamda[docIndex, :K] / rowTotal

    for topicIndex in range(K):
        rowTotal = sum(theta[topicIndex, :])
        theta[topicIndex, :M] = theta[topicIndex, :M] / rowTotal

def EStep():
    """E-step: refresh the module-level array ``p`` in place.

    For every document i and word j, p[i, j, k] is set to
    theta[k, j] * lamda[i, k] and then divided by the sum of those products
    over the K topics. When that sum is zero all K entries are set to 0.
    """
    for i in range(N):
        for j in range(M):
            cell = p[i, j, :K]  # view into p, so writes below land in p
            cell[:] = theta[:K, j] * lamda[i, :K]
            total = sum(cell)
            if total == 0:
                cell[:] = 0
            else:
                cell[:] = cell / total

def MStep():
    """M-step: re-estimate the module-level ``theta`` and ``lamda`` in place.

    theta[k, j] becomes sum_i X[i, j] * p[i, j, k], normalized over the M
    words of topic k; a topic whose row sums to zero gets the uniform value
    1 / M. lamda[i, k] becomes sum_j X[i, j] * p[i, j, k] divided by the total
    count of document i; a document with a zero total gets the uniform value
    1 / K.
    """
    # update theta
    for k in range(0, K):
        for j in range(0, M):
            theta[k, j] = sum(X[:N, j] * p[:N, j, k])
        denominator = sum(theta[k, :M])
        if denominator != 0:
            theta[k, :M] = theta[k, :M] / denominator
        elif M:
            theta[k, :M] = 1.0 / M

    # update lamda
    for i in range(0, N):
        # The document total does not depend on k, so compute it once per
        # document. Summing native Python numbers (tolist) keeps the total
        # from wrapping around when X has a narrow dtype such as int8.
        docTotal = sum(X[i, :M].tolist())
        for k in range(0, K):
            if docTotal == 0:
                lamda[i, k] = 1.0 / K
            else:
                lamda[i, k] = sum(X[i, :M] * p[i, :M, k]) / docTotal

# calculate the log likelihood
def LogLikelihood():
    """Return the log likelihood of ``X`` under the current parameters.

    The result is the sum over documents i and words j of
    X[i, j] * log(sum_k theta[k, j] * lamda[i, k]); pairs whose inner sum is
    not strictly positive contribute nothing.
    """
    total = 0
    for i in range(N):
        docWeights = lamda[i, :K]
        for j in range(M):
            mixture = sum(theta[:K, j] * docWeights)
            if mixture > 0:
                total += X[i, j] * log(mixture)
    return total

# output the params of model and top words of topics to files
def output():
    """Write the model parameters and the top words of each topic to files.

    Four UTF-8 files are written, at the paths held in the module-level
    names ``docTopicDist``, ``topicWordDist``, ``dictionary`` and
    ``topicWords``:

    * document-topic distribution: one line per document with the K values
      of ``lamda``, each followed by a space;
    * topic-word distribution: one line per topic with the M values of
      ``theta``, each followed by a space;
    * dictionary: one term per line, in id order;
    * top words: one line per topic with at most ``topicWordsNum`` terms,
      highest ``theta`` value first, each followed by a space.
    """
    # `with` closes each file even if a write fails part-way through.

    # document-topic distribution
    with codecs.open(docTopicDist, 'w', 'utf-8') as file:
        for i in range(0, N):
            file.write(''.join(str(lamda[i, j]) + ' ' for j in range(0, K)) + '\n')

    # topic-word distribution
    with codecs.open(topicWordDist, 'w', 'utf-8') as file:
        for i in range(0, K):
            file.write(''.join(str(theta[i, j]) + ' ' for j in range(0, M)) + '\n')

    # dictionary
    with codecs.open(dictionary, 'w', 'utf-8') as file:
        for i in range(0, M):
            file.write(id2word[i] + '\n')

    # top words of each topic
    with codecs.open(topicWords, 'w', 'utf-8') as file:
        for i in range(0, K):
            # argsort is ascending, so reverse it to rank the largest first
            ranked = theta[i, :].argsort()[::-1]
            best = ranked[0:min(topicWordsNum, len(ranked))]
            file.write(''.join(id2word[j] + ' ' for j in best) + '\n')
    
# set the default params and read the params from cmd
# (these module-level names are read directly by the functions defined above)
datasetFilePath = 'dataset.txt'
stopwordsFilePath = 'stopwords.dic'
K = 10    # number of topic
maxIteration = 30    # upper bound on the number of EM iterations
threshold = 10.0    # stop once the log likelihood improves by less than this
topicWordsNum = 10    # how many top words output() writes per topic
docTopicDist = 'docTopicDistribution.txt'
topicWordDist = 'topicWordDistribution.txt'
dictionary = 'dictionary.dic'
topicWords = 'topics.txt'
# The defaults are overridden only when exactly ten arguments follow the
# script name; with any other argument count they are kept unchanged.
if(len(sys.argv) == 11):
    datasetFilePath = sys.argv[1]
    stopwordsFilePath = sys.argv[2]
    K = int(sys.argv[3])
    maxIteration = int(sys.argv[4])
    threshold = float(sys.argv[5])
    topicWordsNum = int(sys.argv[6])
    docTopicDist = sys.argv[7]
    topicWordDist = sys.argv[8]
    dictionary = sys.argv[9]
    topicWords = sys.argv[10]

# preprocessing
N, M, word2id, id2word, X = preprocessing(datasetFilePath, stopwordsFilePath)

# lamda[i, j] : p(zj|di)
# random start values; rescaled by initializeParameters() below
lamda = random([N, K])

# theta[i, j] : p(wj|zi)
# random start values; rescaled by initializeParameters() below
theta = random([K, M])

# p[i, j, k] : p(zk|di,wj)
# starts as zeros and is filled by EStep()
p = zeros([N, M, K])

initializeParameters()

# EM algorithm
# The value 1 doubles as "no previous value yet": the convergence test below
# is skipped while oldLoglikelihood is still 1, i.e. on the first iteration.
oldLoglikelihood = 1
newLoglikelihood = 1
for i in range(0, maxIteration):
    EStep()
    MStep()
    newLoglikelihood = LogLikelihood()
    # progress line: timestamp, 1-based iteration number, current log likelihood
    print("[", time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())), "] ", i+1, " iteration  ", str(newLoglikelihood))
    # stop early once the gain over the previous iteration drops below threshold
    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
        break
    oldLoglikelihood = newLoglikelihood

output()

quickquip

2022/10/18 01:49 編集

> 変数segListも何を格納しているのかが不明で困っています

これは中国語版のコードの話ですか?

> これをmecabを用いて書くとエラーは出ないのですが，期待する実行結果が得られません

期待する実行結果と，いまどのような結果が得られているかを書いてください。この文には情報がありません。（飲食店に入って「俺が食べたいものを出してくれ」と注文してるようなものです）もしくは本当に「自分が期待する結果を教えてください」というニュアンスの質問でしょうか?

maro

2022/10/18 02:51

申し訳ありません．segListについての質問は，中国語の文書を形態素解析するjiebaを用いた元のプログラムに関するものです．期待する出力結果とは，日本語文書のテキストファイルを渡すと，文書がトピックごとにクラスタリングされて出力されることを指します．

行動規範の内容に同意します

回答1件

ベストアンサー

提示されたコード詳細およびやりたいことは理解できていませんが、-Owakatiでmecab.parseした結果は単語を空白で区切った文字列を返すので、単語毎に処理したい場合は.splitなりで空白毎に分割する必要があります。

Python
1import MeCab
2
3documents = ['今日の天気は晴れです。']
4for doc in documents:
5    print(f'[{doc}]') # prints: [今日の天気は晴れです。]
6
7    mecab = MeCab.Tagger("-Owakati")  # -Owakati: the parse result is the words separated by spaces
8    mecab.parse("")
9    segList = mecab.parse(doc)  # a single string, not a list of words
10    print(f'[{segList}]')  # output shown on the next two lines; the string ends with a newline
11    #[今日 の 天気 は 晴れ です 。
12    #]
13
14    segList = segList.split(' ')  # split on single spaces to get one list element per word
15    print(segList)  # output shown below; the last element is the leftover newline
16    # ['今日', 'の', '天気', 'は', '晴れ', 'です', '。', '\n']