回答率: 85.47%

質問するログイン新規登録

トップに関する質問 pythonでTFIDFの処理を並列処理するには

編集履歴

質問編集履歴

1

追記

2017/07/06 11:42

投稿

スコア140

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -169,3 +169,103 @@
 どのように改善すればよろしいですか
 よろしくお願いします
+ちなみに並列化しないバージョンはこれです
+```python
+import nltk
+import numpy as np
+import json
+import nltk_exa as nl
+import time
+#def tfidf(word):
+def tfidf():
+    word = []
+    f = open("word0_a.txt")
+    line = f.readline()
+    while line:
+        line = f.readline()
+        word.append(line.replace("[","").replace("]","").replace(",","").replace("\"","").split())
+    f.close()
+    word.pop()
+    #tfidfの計算
+    doc = []
+    lists = []
+    collection = nltk.TextCollection(word) #サイトにのっていた
+    t1 = time.time()
+    for do in word:
+        wo=[]
+        for term in set(do):
+            if(collection.tf_idf(term, do) > 0):
+                wo.append([term,collection.tf_idf(term, do)]) #ここも上のサイトにのってる
+                #print(wo)
+        wo.sort(key=lambda x:x[1])
+        wo.reverse()
+        #print(wo)
+        slice1 = np.array(wo[:20]) #先頭の文字から終了インデックスまでが抽出
+        lists = slice1[:,0] #[:]は戦闘から終了のインデックスまで抽出と、slice1の0番目を格納
+        doc.append(list(lists)) #listsが文字列だから、リストに格納
+        #print(doc[0])
+        t2 = time.time()
+        #print('processing time1(一回のfor文): ' + str(t2 - t1) + '(sec)')
+        del wo
+    print(doc)
+    t3 = time.time()
+    print('processing time2(終わり): ' + str(t3 - t1) + '(sec)')
+if __name__ == "__main__":
+        tfidf()
+```