Question edit history

Revision 1
Made the purpose of the overall program explicit.

test  CHANGED
File without changes

test  CHANGED
@@ -1,8 +1,12 @@
 ### Background
 I am having trouble with a program, specifically a function that performs preprocessing for the PLSA topic model.

+The point I want to ask about is the function called preprocessing.
+
 This function uses jieba, which performs morphological analysis for Chinese, but I want to perform morphological analysis for Japanese, so I am planning to use mecab. When I rewrite it with mecab, no error occurs, but I do not get the expected result.
-Also, I do not understand what the variable segList
+Also, I am stuck because I do not understand what the variable segList stores.
+
+What this program as a whole is meant to do is cluster Japanese documents by topic.

 Could you please advise me?

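For reference, the segList confusion comes down to the return types of the two tokenizers: jieba.cut returns a generator of word tokens, while MeCab's -Owakati parse returns a single whitespace-joined string, so looping over it yields single characters rather than words. A minimal sketch, assuming jieba and a mecab-python binding are installed (the sample sentences are made up):

```python
import jieba
import MeCab

# jieba.cut returns a generator of word tokens; this is what segList holds.
segList = jieba.cut("我爱北京天安门")
print(list(segList))   # e.g. ['我', '爱', '北京', '天安门']

# MeCab's "-Owakati" output is ONE space-separated string, so a for-loop
# over it yields single characters, not words.
mecab = MeCab.Tagger("-Owakati")
wakati = mecab.parse("私は東京大学に行った")
print(wakati)          # e.g. '私 は 東京 大学 に 行っ た \n'
print(wakati.split())  # splitting recovers a list of word tokens
```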
@@ -10,17 +14,17 @@

 ### What I want to achieve

-
+
 - [ ] Modify the program so that it can perform morphological analysis on Japanese documents.

 ### Problems / error messages
-
-```
-Error
+No error occurs, but I do not get the expected result.
-
+The expected result is that the Japanese documents are clustered by topic.
+
+

 ### Relevant source code
-
+I will show only the function in question from the overall program.
 ```python
 def preprocessing(datasetFilePath, stopwordsFilePath):

@@ -42,11 +46,17 @@
     id2word = {}
     currentId = 0;
     # generate the word2id and id2word maps and count the number of times of words showing up in documents
-    # Generate the word2id and id2word maps and count how many times each word appears in the doc
+    # Generate the word2id and id2word maps and count how many times each word appears in the documents.
     for document in documents:
-        segList = jieba.cut(document)  # jieba: morphological analysis for Chinese
-

+
+        mecab = MeCab.Tagger("-Owakati")
+        mecab.parse("")
+        segList = mecab.parse(str(documents))
+        print(segList)
+
+
+
         wordCount = {}
         for word in segList:
             word = word.lower().strip()  # lower: convert uppercase letters to lowercase
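Two details stand out in the replacement above: a new Tagger is built on every iteration, and mecab.parse(str(documents)) stringifies the whole document list instead of the current document. A per-document sketch of what was presumably intended follows; the sample corpus and the .split() call are assumptions, not the questioner's code:

```python
import MeCab

documents = ["私は東京大学に行った", "犬も歩けば棒に当たる"]  # hypothetical sample

mecab = MeCab.Tagger("-Owakati")  # build the tagger once, outside the loop
mecab.parse("")  # known workaround for a GC issue in older mecab-python bindings

for document in documents:
    # tokenize ONE document at a time and split the wakati string into words
    segList = mecab.parse(document).split()
    print(segList)
```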
@@ -75,12 +85,30 @@
     return N, M, word2id, id2word, X
 ```

+
+
+
 ### What I tried

-
+
-I changed the program as shown below.
 ### Supplementary information (FW / tool versions, etc.)
+Below is the overall program before modification.
 ```
+from numpy import zeros, int8, log
+from pylab import random
+import sys
+import jieba
+import re
+import time
+import codecs
+
+# segmentation, stopwords filtering and document-word matrix generating
+# [return]:
+# N : number of documents
+# M : length of dictionary
+# word2id : a map mapping terms to their corresponding ids
+# id2word : a map mapping ids to terms
+# X : document-word matrix, N*M, each line is the number of terms that show up in the document
 def preprocessing(datasetFilePath, stopwordsFilePath):

     # read the stopwords file
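The comment block above describes X as an N*M document-word count matrix. A toy illustration of that structure, with a hypothetical two-document corpus over a three-word vocabulary:

```python
from numpy import zeros, int8

# hypothetical 2-document corpus over a 3-word vocabulary
word2id = {'dog': 0, 'cat': 1, 'bird': 2}
docs = [['dog', 'cat', 'dog'], ['bird']]

X = zeros([len(docs), len(word2id)], int8)
for i, doc in enumerate(docs):
    for w in doc:
        X[i, word2id[w]] += 1   # count occurrences of word w in document i

print(X)  # [[2 1 0]
          #  [0 0 1]]
```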
@@ -89,8 +117,8 @@
     file.close()

     # read the documents
-    file = codecs.open(datasetFilePath, 'r', 'utf-8')
+    file = codecs.open(datasetFilePath, 'r', 'utf-8')
-    documents = [document.strip() for document in file]
+    documents = [document.strip() for document in file]
     file.close()

     # number of documents
@@ -101,20 +129,11 @@
     id2word = {}
     currentId = 0;
     # generate the word2id and id2word maps and count the number of times of words showing up in documents
-    # Generate the word2id and id2word maps and count how many times each word appears in the documents.
     for document in documents:
-
-
-        mecab = MeCab.Tagger("-Owakati")
-        mecab.parse("")
-        segList =
+        segList = jieba.cut(document)
-        print(segList)
-
-
-
         wordCount = {}
         for word in segList:
-            word = word.lower().strip()
+            word = word.lower().strip()
             if len(word) > 1 and not re.search('[0-9]', word) and word not in stopwords:
                 if word not in word2id.keys():
                     word2id[word] = currentId;
@@ -129,8 +148,8 @@
     # length of dictionary
     M = len(word2id)

-    # generate the document-word matrix
+    # generate the document-word matrix
-    X = zeros([N, M], int8)
+    X = zeros([N, M], int8)
     for word in word2id.keys():
         j = word2id[word]
         for i in range(0, N):
@@ -138,6 +157,161 @@
                 X[i, j] = wordCounts[i][word];

     return N, M, word2id, id2word, X
+
+def initializeParameters():
+    for i in range(0, N):
+        normalization = sum(lamda[i, :])
+        for j in range(0, K):
+            lamda[i, j] /= normalization;
+
+    for i in range(0, K):
+        normalization = sum(theta[i, :])
+        for j in range(0, M):
+            theta[i, j] /= normalization;
+
+def EStep():
+    for i in range(0, N):
+        for j in range(0, M):
+            denominator = 0;
+            for k in range(0, K):
+                p[i, j, k] = theta[k, j] * lamda[i, k];
+                denominator += p[i, j, k];
+            if denominator == 0:
+                for k in range(0, K):
+                    p[i, j, k] = 0;
+            else:
+                for k in range(0, K):
+                    p[i, j, k] /= denominator;
+
+def MStep():
+    # update theta
+    for k in range(0, K):
+        denominator = 0
+        for j in range(0, M):
+            theta[k, j] = 0
+            for i in range(0, N):
+                theta[k, j] += X[i, j] * p[i, j, k]
+            denominator += theta[k, j]
+        if denominator == 0:
+            for j in range(0, M):
+                theta[k, j] = 1.0 / M
+        else:
+            for j in range(0, M):
+                theta[k, j] /= denominator
+
+    # update lamda
+    for i in range(0, N):
+        for k in range(0, K):
+            lamda[i, k] = 0
+            denominator = 0
+            for j in range(0, M):
+                lamda[i, k] += X[i, j] * p[i, j, k]
+                denominator += X[i, j];
+            if denominator == 0:
+                lamda[i, k] = 1.0 / K
+            else:
+                lamda[i, k] /= denominator
+
+# calculate the log likelihood
+def LogLikelihood():
+    loglikelihood = 0
+    for i in range(0, N):
+        for j in range(0, M):
+            tmp = 0
+            for k in range(0, K):
+                tmp += theta[k, j] * lamda[i, k]
+            if tmp > 0:
+                loglikelihood += X[i, j] * log(tmp)
+    return loglikelihood
+
+# output the params of model and top words of topics to files
+def output():
+    # document-topic distribution
+    file = codecs.open(docTopicDist, 'w', 'utf-8')
+    for i in range(0, N):
+        tmp = ''
+        for j in range(0, K):
+            tmp += str(lamda[i, j]) + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+    # topic-word distribution
+    file = codecs.open(topicWordDist, 'w', 'utf-8')
+    for i in range(0, K):
+        tmp = ''
+        for j in range(0, M):
+            tmp += str(theta[i, j]) + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+    # dictionary
+    file = codecs.open(dictionary, 'w', 'utf-8')
+    for i in range(0, M):
+        file.write(id2word[i] + '\n')
+    file.close()
+
+    # top words of each topic
+    file = codecs.open(topicWords, 'w', 'utf-8')
+    for i in range(0, K):
+        topicword = []
+        ids = theta[i, :].argsort()
+        for j in ids:
+            topicword.insert(0, id2word[j])
+        tmp = ''
+        for word in topicword[0:min(topicWordsNum, len(topicword))]:
+            tmp += word + ' '
+        file.write(tmp + '\n')
+    file.close()
+
+# set the default params and read the params from cmd
+datasetFilePath = 'dataset.txt'
+stopwordsFilePath = 'stopwords.dic'
+K = 10    # number of topics
+maxIteration = 30
+threshold = 10.0
+topicWordsNum = 10
+docTopicDist = 'docTopicDistribution.txt'
+topicWordDist = 'topicWordDistribution.txt'
+dictionary = 'dictionary.dic'
+topicWords = 'topics.txt'
+if(len(sys.argv) == 11):
+    datasetFilePath = sys.argv[1]
+    stopwordsFilePath = sys.argv[2]
+    K = int(sys.argv[3])
+    maxIteration = int(sys.argv[4])
+    threshold = float(sys.argv[5])
+    topicWordsNum = int(sys.argv[6])
+    docTopicDist = sys.argv[7]
+    topicWordDist = sys.argv[8]
+    dictionary = sys.argv[9]
+    topicWords = sys.argv[10]
+
+# preprocessing
+N, M, word2id, id2word, X = preprocessing(datasetFilePath, stopwordsFilePath)
+
+# lamda[i, j] : p(zj|di)
+lamda = random([N, K])
+
+# theta[i, j] : p(wj|zi)
+theta = random([K, M])
+
+# p[i, j, k] : p(zk|di,wj)
+p = zeros([N, M, K])
+
+initializeParameters()
+
+# EM algorithm
+oldLoglikelihood = 1
+newLoglikelihood = 1
+for i in range(0, maxIteration):
+    EStep()
+    MStep()
+    newLoglikelihood = LogLikelihood()
+    print("[", time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), "] ", i+1, " iteration ", str(newLoglikelihood))
+    if(oldLoglikelihood != 1 and newLoglikelihood - oldLoglikelihood < threshold):
+        break
+    oldLoglikelihood = newLoglikelihood
+
+output()
+
 ```
-
-
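Since the stated goal is clustering Japanese documents by topic, the relevant output of the full program above is lamda[i, k], an estimate of p(zk|di), which output() writes row by row to docTopicDistribution.txt. Below is a small sketch of turning those rows into hard cluster assignments; this helper is not part of the question, and the file name is the script's default:

```python
# Assign each document to its most probable topic, reading the
# docTopicDistribution.txt that output() writes (one row of K
# probabilities per document).
with open('docTopicDistribution.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        probs = [float(x) for x in line.split()]
        topic = probs.index(max(probs))  # argmax over the K topics
        print(f"document {i} -> topic {topic}")
```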