リアルタイムのツイート群からURL, 返信, RT(と絵文字)を削除したテキストを作成し名詞を抽出した後、辞書の構築を中心に、いくつかの処理をしたいと考えています。
今迄はツイートを収集する
def on_status(self, status):
に命令をまとめてしまっていたのですが、処理が重くなってしまい良くないのではと思いツイートを収集する部分であるclassから離れて書こうと思っています。
しかし、どういう構成を取れば上手く処理できるのか、ということに関して自分の知識と経験では見当もつきません。
これに関して、解決策を教えていただけますと幸いです。
宜しくお願いします。
エラー
Traceback (most recent call last): File "final.py", line 118, in <module> stream.sample() File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample self._start(is_async) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start self._run() File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run six.reraise(*exc_info) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise raise value File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run self._read_loop(resp) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop self._data(next_status_obj) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data if self.listener.on_data(data) is False: File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data if self.on_status(status) is False: File "final.py", line 100, in on_status tfidf = vectorizer.fit_transform(corpus) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform X = super().fit_transform(raw_documents) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1079, in fit_transform max_features) File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 948, in _limit_features raise ValueError("After pruning, no terms remain. Try a lower" ValueError: After pruning, no terms remain. Try a lower min_df or a higher max_df. MACUsernoMacBook-Air:trendword macuser$ python final_2.py Traceback (most recent call last): File "final_2.py", line 47, in <module> f.write(text) NameError: name 'text' is not defined
該当のソースコード
python
1import os 2import tweepy 3import redis 4import math 5from collections import Counter 6from collections import defaultdict 7import re 8from natto import MeCab 9import codecs 10import sys 11from sklearn.feature_extraction.text import TfidfVectorizer 12import glob 13import numpy as np 14import urllib.request 15from gensim import corpora 16from itertools import chain 17 18#frequency = defaultdict(int) 19 20#r = redis.Redis(host='localhost', port=6379, db=0) 21 22TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID'] 23TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET'] 24 25TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN'] 26TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET'] 27 28auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID,TWITTER_CLIENT_SECRET) 29auth.set_access_token(TWITTER_OAUTH_TOKEN,TWITTER_OAUTH_TOKEN_SECRET) 30 31class StreamListener(tweepy.StreamListener): 32 def __init__(self): 33 super().__init__() 34 self.count = 0 # 取得したtweet数 35 36 def on_status(self, status): 37 38 #日本語ツイートをファイルに書き込んでいる + ツイートの数を表示 39 if status.lang == "ja": 40 self.count += 1 41 print(self.count, text) 42 43 def on_error(self, status_code): 44 return False 45 46with open("test48.txt", "a", encoding="utf-8") as f: 47 f.write(text) 48with codecs.open("test48.txt", "r", "utf-8") as f: 49 corpus = f.read().split("\n") 50 51text = str(status.text) 52text = re.sub(r"http\S+", "", text) 53text = re.sub(r"@(\w+) ", "", text) 54#text = re.sub(r"#(\w+)", "", text) 55text = re.sub(r"(^RT.*)", "", text, flags=re.MULTILINE | re.DOTALL) 56emoji_pattern = re.compile("[" 57u"\U0001F600-\U0001F64F" 58u"\U0001F300-\U0001F5FF" 59u"\U0001F680-\U0001F6FF" 60u"\U0001F1E0-\U0001F1FF" 61"]+", flags=re.UNICODE) 62text = emoji_pattern.sub("", text) 63 64mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd') 65 66#if tagger.lang == 'ja': 67#名詞の抽出と整形 68rm_list = ["RT","https","co","さん","フォロー","本日","応募","今日","プレゼント","お金","FGO","無料","本人","投稿","動画","ツイート","リツイート","Twitter","ローソン","Peing","http","Amazonギフト券","bot","発売中","Youtube","www","WWW","質問箱","コラボ","フォロワー","DM","いいね","RT","lawson","://","!","peing","youtube","抽選","jp","リプ","キャンペーン","チケット","期間限定","DHC","日本","amp","人間","チャンネル","配信中","YouTube","WEB","楽しみ","イラスト","くじ","@","__"] 69 70stop_words = [] 71path = 'stop_words.txt' 72with open(path) as g: 73 stop_words = g.readlines() 74 75docs = [] 76for txt in corpus: 77 words = mecab.parse(txt, as_nodes=True) 78 doc = [] 79 80 for w in words: 81 if w.feature.split(",")[0] == "名詞": 82 if len(w.surface) >= 3: 83 if w.surface not in rm_list: 84 doc.append(w.surface) 85 86 docs.append(doc) 87corpus = docs 88 89#辞書の構築 90dictionary = corpora.Dictionary(corpus) 91dictionary.filter_extremes(no_below=20, no_above=0.5) 92dictionary.save_as_text('test40-dic10.txt') 93 94#単語の頻出度の抽出 95counter = Counter(chain.from_iterable(corpus)) 96for word, count in counter.most_common(): 97 if count > 19: 98 print(f"{word}: {count}") 99 100#tf-idf計算 101vectorizer = TfidfVectorizer(min_df=0.02, analyzer=lambda x:x) 102tfidf = vectorizer.fit_transform(corpus) 103 104#スコアの表示 105print(tfidf.toarray()) 106# テキストの数、出現した単語の数 107print(tfidf.shape) 108 109#並べ替え 110feature_names = np.array(vectorizer.get_feature_names()) 111for vec in tfidf: 112 index = np.argsort(vec.toarray(), axis=1)[:,::-1] 113 feature_words = feature_names[index] 114 print(feature_words[:,:10]) 115 116stream = tweepy.Stream(auth=auth, listener=StreamListener()) 117stream.sample()
補足情報(FW/ツールのバージョンなど)
iOS 10.12.6, Python 3.7.3, Atom
回答1件
あなたの回答
tips
プレビュー