質問編集履歴
1
追記
test
CHANGED
File without changes
|
test
CHANGED
@@ -169,3 +169,103 @@
|
|
169
169
|
どのように改善すればよろしいですか
|
170
170
|
|
171
171
|
よろしくお願いします
|
172
|
+
|
173
|
+
|
174
|
+
|
175
|
+
ちなみに並列化しないバージョンはこれです
|
176
|
+
|
177
|
+
```python
|
178
|
+
|
179
|
+
import nltk
|
180
|
+
|
181
|
+
import numpy as np
|
182
|
+
|
183
|
+
import json
|
184
|
+
|
185
|
+
import nltk_exa as nl
|
186
|
+
|
187
|
+
import time
|
188
|
+
|
189
|
+
|
190
|
+
|
191
|
+
#def tfidf(word):
|
192
|
+
|
193
|
+
# Characters removed from every input line before whitespace tokenisation
# (the input lines look like serialised lists: ["a", "b", ...]).
_STRIP_PUNCTUATION = str.maketrans("", "", '[],"')


def _read_documents(path):
    """Return one token list per line of the text file at *path*."""
    # Iterating the file object visits every line exactly once. The previous
    # readline()/while loop read a line before the loop and again at the top
    # of the loop body, so the first line of the file was never stored.
    with open(path) as handle:
        return [line.translate(_STRIP_PUNCTUATION).split() for line in handle]


def _top_terms(collection, tokens, limit):
    """Return up to *limit* distinct terms of *tokens*, highest TF-IDF first.

    Only terms whose TF-IDF score is strictly positive are kept.
    """
    scored = []
    for term in set(tokens):
        # Compute the score once and reuse it for both the filter and the
        # ranking instead of calling tf_idf twice per term.
        score = collection.tf_idf(term, tokens)
        if score > 0:
            scored.append((term, score))
    scored.sort(key=lambda pair: pair[1], reverse=True)
    # Plain list slicing also covers the "no positive term" case, which
    # previously failed when indexing column 0 of an empty NumPy array.
    return [term for term, _ in scored[:limit]]


def tfidf(path="word0_a.txt", top_n=20):
    """Print the top TF-IDF terms of every document in a token file.

    Each line of the file is treated as one document: the characters
    ``[``, ``]``, ``,`` and ``"`` are removed and the remainder is split on
    whitespace. For every document the terms with a positive TF-IDF score
    are ranked in descending order and the first *top_n* are kept.

    The resulting list of term lists is printed, followed by the elapsed
    time measured from just after the collection is built until after the
    result has been printed.

    Args:
        path: Text file to read; defaults to the originally hard-coded name.
        top_n: Maximum number of terms kept per document.

    Returns:
        A list with one list of terms (plain ``str``) per input line.
        A document without any positive-scoring term yields an empty list.
    """
    word = _read_documents(path)

    collection = nltk.TextCollection(word)
    t1 = time.time()

    doc = [_top_terms(collection, tokens, top_n) for tokens in word]

    print(doc)
    t3 = time.time()
    print('processing time2(終わり): ' + str(t3 - t1) + '(sec)')
    return doc


if __name__ == "__main__":
    tfidf()
|
268
|
+
|
269
|
+
|
270
|
+
|
271
|
+
```
|