ある文章についてgensimのTfidfModelを用いたところ、
(単語id, tfidf値)のリストを得ることが出来たのですが、一部の単語id(単語'the'のid)は含まれていませんでした。
リファレンスも確認しましたが、原因が分からず困っています。よろしくお願いします。
リファレンス : https://radimrehurek.com/gensim/models/tfidfmodel.html
python3
# Documents to analyze.
corpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]


# import libraries
import nltk
nltk.download('punkt')
import string
import numpy as np
import gensim
import pandas as pd


def tokenize(texts):
    """Stem-tokenize a LIST of texts into one flat token list.

    NOTE: currently unused (tf_idf_gensim uses tokenize2, which handles a
    single document); kept for reference.
    """
    stem = nltk.stem.SnowballStemmer('english')
    stem_list = []
    for text in texts:
        text = text.lower()
        for token in nltk.word_tokenize(text):
            # Skip pure punctuation tokens.
            if token in string.punctuation:
                continue
            stem_list.append(stem.stem(token))
    return stem_list


def tokenize2(text):
    """Stem-tokenize a single document, dropping punctuation tokens."""
    stem = nltk.stem.SnowballStemmer('english')
    stem_list = []
    for token in nltk.word_tokenize(text):
        if token in string.punctuation:
            continue
        stem_list.append(stem.stem(token))
    return stem_list


def tf_idf_gensim(corpus):
    """Return [(token, tfidf_value), ...] for the first document of corpus.

    Why 'the' disappears from the result:
    gensim's TfidfModel computes idf = log2(N / df) and, by default, drops
    terms whose TF-IDF weight is 0.  'the' occurs in all 3 documents, so
    df == N == 3 and idf = log2(3/3) = 0 — the term is therefore omitted
    from tfidf[bow].  This is documented TfidfModel behavior, not a bug in
    this code.
    """
    tokenized_docs = [tokenize2(doc) for doc in corpus]
    print('tokenized docs : ', tokenized_docs)

    lexicon = gensim.corpora.Dictionary(tokenized_docs)
    # FIX: the original code swapped these two names —
    # lexicon.token2id maps token -> id; its inverse maps id -> token.
    token2id = lexicon.token2id
    id2token = {v: k for k, v in token2id.items()}
    print(" id2token : ", id2token)

    bow_corpus = [lexicon.doc2bow(tokenize2(doc)) for doc in corpus]
    # doc2bow yields (id, count) pairs — label corrected from 'token : id'.
    print('(token, count) in bow_corpus[0]',
          [(id2token[a], b) for a, b in bow_corpus[0]])

    tfidf = gensim.models.TfidfModel(corpus=bow_corpus,
                                     dictionary=lexicon,
                                     normalize=True)

    # Zero-weight terms (idf == 0, e.g. 'the') are absent from tfidf[bow].
    return [(id2token[token_id], tfidf_value)
            for token_id, tfidf_value in tfidf[bow_corpus[0]]]


print(tf_idf_gensim(corpus))
------出力結果------------------------------
dictionary : [['the', 'eleph', 'sneez', 'at', 'the', 'sight', 'of', 'potato'], ['bat', 'can', 'see', 'via', 'echoloc', 'see', 'the', 'bat', 'sight', 'sneez'], ['wonder', 'she', 'open', 'the', 'door', 'to', 'the', 'studio']]
token2id : {0: 'at', 1: 'eleph', 2: 'of', 3: 'potato', 4: 'sight', 5: 'sneez', 6: 'the', 7: 'bat', 8: 'can', 9: 'echoloc', 10: 'see', 11: 'via', 12: 'door', 13: 'open', 14: 'she', 15: 'studio', 16: 'to', 17: 'wonder'}
#tfidfに代入前
(token : id ) in corpus_[0]: [('at', 1), ('eleph', 1), ('of', 1), ('potato', 1), ('sight', 1), ('sneez', 1), ('the', 2)]
###tfidfに代入後->theがなくなっている
[('at', 0.4837965208957426), ('eleph', 0.4837965208957426), ('of', 0.4837965208957426), ('potato', 0.4837965208957426), ('sight', 0.17855490118826325), ('sneez', 0.17855490118826325)]
回答1件
あなたの回答
tips
プレビュー