I want to build a dictionary from the nouns extracted from a stream of real-time tweets.
After removing stop words and running the code, I got the ValueError shown below.
Could you advise me on how to resolve it?
Thank you in advance.
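For context, this is roughly what I mean by "building a dictionary" (a minimal sketch with made-up token lists, not my actual stream data): gensim's corpora.Dictionary takes an iterable of documents, each given as a list of tokens.

# Minimal sketch of the intended goal, using hypothetical token lists.
from gensim import corpora

texts = [["トレンド", "ワード"], ["ツイート", "名詞"]]  # one token list per tweet
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)                              # token -> integer id mapping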
Error
Traceback (most recent call last):
  File "final.py", line 117, in <module>
    stream.sample()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample
    self._start(is_async)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start
    self._run()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run
    six.reraise(*exc_info)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise
    raise value
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run
    self._read_loop(resp)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop
    self._data(next_status_obj)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data
    if self.listener.on_data(data) is False:
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data
    if self.on_status(status) is False:
  File "final.py", line 99, in on_status
    tfidf = vectorizer.fit_transform(corpus)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words
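For reference, the error message itself is easy to reproduce outside my script. A minimal sketch (made-up single-character "documents", not my real data): TfidfVectorizer's default token_pattern, r"(?u)\b\w\w+\b", only keeps tokens of two or more word characters, so a corpus that reduces to single characters ends up with an empty vocabulary.

# Minimal reproduction sketch, unrelated to final.py's actual data.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["あ", "い", "う"]       # each "document" is a single character
vectorizer = TfidfVectorizer()    # default token_pattern drops 1-character tokens
vectorizer.fit_transform(corpus)  # ValueError: empty vocabulary; perhaps the documents only contain stop words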
Relevant source code
final.py
import os
import tweepy
import redis
import math
from collections import Counter
from collections import defaultdict
import re
from natto import MeCab
import codecs
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
import numpy as np
import urllib.request
from gensim import corpora
from itertools import chain

#frequency = defaultdict(int)

#r = redis.Redis(host='localhost', port=6379, db=0)

TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID']
TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET']

TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN']
TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET']

auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID, TWITTER_CLIENT_SECRET)
auth.set_access_token(TWITTER_OAUTH_TOKEN, TWITTER_OAUTH_TOKEN_SECRET)


class StreamListener(tweepy.StreamListener):
    def __init__(self):
        super().__init__()
        self.count = 0  # number of tweets received

    def on_status(self, status):
        text = str(status.text)
        # https://www.pytry3g.com/entry/master-Preprocessing#正規表現を使う
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"@(\w+) ", "", text)
        text = re.sub(r"#(\w+)", "", text)
        # retweets are not removed yet
        text = re.sub(r"(^RT.*)", "", text, flags=re.MULTILINE | re.DOTALL)
        # emoji should also be removed
        emoji_pattern = re.compile("["
                                   u"\U0001F600-\U0001F64F"
                                   u"\U0001F300-\U0001F5FF"
                                   u"\U0001F680-\U0001F6FF"
                                   u"\U0001F1E0-\U0001F1FF"
                                   "]+", flags=re.UNICODE)
        text = emoji_pattern.sub("", text)

        # write Japanese tweets to a file and print the tweet count
        if status.lang == "ja":
            self.count += 1
            print(self.count, text)
            with open("test45.txt", "a", encoding="utf-8") as f:
                f.write(text)
            with codecs.open("test45.txt", "r", "utf-8") as f:
                corpus = f.read().split("\n")

            mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

            #if tagger.lang == 'ja':
            # extract and clean up nouns
            rm_list = ["RT","https","co","さん","フォロー","本日","応募","今日","プレゼント","お金","FGO","無料","本人","投稿","動画","ツイート","リツイート","Twitter","ローソン","Peing","http","Amazonギフト券","bot","発売中","Youtube","www","WWW","質問箱","コラボ","フォロワー","DM","いいね","RT","lawson","://","!","peing","youtube","抽選","jp","リプ","キャンペーン","チケット","期間限定","DHC","日本","amp","人間","チャンネル","配信中","YouTube","WEB","楽しみ","イラスト","くじ","@","__"]

            stop_words = []
            path = 'stop_words.txt'
            with open(path) as g:
                stop_words = g.readlines()

            docs = []
            for txt in corpus:
                words = mecab.parse(txt, as_nodes=True)
                doc = []

                for w in words:
                    if w.feature.split(",")[0] == "名詞":
                        if len(w.surface) >= 3:
                            if not any(rm in w.surface for rm in rm_list):
                                doc.extend(str(w.surface))

                docs.extend(doc)
            corpus = docs

            # build the dictionary
            dictionary = corpora.Dictionary([corpus])
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            dictionary.save_as_text('test40-dic8.txt')

            # extract word frequencies
            counter = Counter(chain.from_iterable(corpus))
            for word, count in counter.most_common():
                if count > 19:
                    print(f"{word}: {count}")

            # compute tf-idf
            vectorizer = TfidfVectorizer(min_df=0.02)
            tfidf = vectorizer.fit_transform(corpus)

            # print the scores
            print(tfidf.toarray())
            # number of texts, number of words that appeared
            print(tfidf.shape)

            # sort by score
            feature_names = np.array(vectorizer.get_feature_names())
            for vec in tfidf:
                index = np.argsort(vec.toarray(), axis=1)[:, ::-1]
                feature_words = feature_names[index]
                print(feature_words[:, :10])

    def on_error(self, status_code):
        return False


stream = tweepy.Stream(auth=auth,
                       listener=StreamListener())
stream.sample()
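The part of the code above I am least sure about is the doc.extend(str(w.surface)) line. A minimal sketch (plain Python with a hypothetical word, unrelated to MeCab) of how extend and append behave differently on a string:

# extend() with a string adds its individual characters; append() keeps the word whole.
doc = []
doc.extend("トレンド")   # -> ['ト', 'レ', 'ン', 'ド']

doc = []
doc.append("トレンド")   # -> ['トレンド']
print(doc)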
Supplementary information (framework/tool versions, etc.)
macOS 10.12.6, Python 3.7.3, Atom