I am using the Twitter API to analyze which words frequently appear in tweets that also contain a given keyword.
With the code below, is it also possible to search for tweets that contain multiple words?
Example: retrieving only tweets that contain both 学校 and ひま (a sketch of what I have in mind follows the code below).
Code
```python
import tweepy
import MeCab
import pyfpgrowth
import pprint
import pandas as pd
import pytablewriter

# Search keyword (api is assumed to be an authenticated tweepy.API instance;
# the authentication code is omitted here)
q = "学校 exclude:retweets"
date_since = "2020-01-01"

# Fetch up to 500 matching tweets
tweet_doc = []
for tweet in tweepy.Cursor(api.search, q=q, tweet_mode="extended", since=date_since).items(500):
    tweet_doc.append(tweet.full_text)

def sep_by_mecab(text):
    # Return the base forms of nouns, skipping a small stopword list
    m = MeCab.Tagger('-Ochasen')
    node = m.parseToNode(text)
    word_list = []
    while node:
        hinshi = node.feature.split(",")[0]
        if hinshi in ["名詞"]:
            origin = node.feature.split(",")[6]
            if origin not in ["*", "する", "いる", "なる", "てる", "れる", "ある", "こと", "もの",
                              "日", "ん", "な", "の", "大", "ない", "そう", "いい"]:
                word_list.append(origin)
        node = node.next
    return word_list

# One transaction (set of words) per tweet
documents = []
for t in tweet_doc:
    documents.append(set(sep_by_mecab(t)))

# Frequent-pattern analysis (minimum support = 5)
patterns = pyfpgrowth.find_frequent_patterns(documents, 5)

# Sort in descending order of frequency for readability
sorted_patterns = sorted(patterns.items(), reverse=True, key=lambda x: x[1])
pprint.pprint(sorted_patterns)

pd.set_option('display.unicode.east_asian_width', True)

# Accumulate the frequency of each word over all patterns
words = {}
for pattern, count in sorted_patterns:
    for word in pattern:
        if word not in words:
            words[word] = 0
        words[word] += count

df = pd.DataFrame({'value': list(words.keys()), 'count': list(words.values())})
df = df.sort_values('count', ascending=False).reset_index(drop=True)
df.columns = ['単語', '件数']  # rename the columns for the Excel output
df.to_excel('pattern5.xlsx', index=False)

# Show the top rows as a Markdown table
writer = pytablewriter.MarkdownTableWriter()
writer.from_dataframe(df.head())
writer.write_table()

# Also export the raw patterns and their counts
df = pd.DataFrame(sorted_patterns, columns=['単語', '件数'])
df.to_excel('patterns6.xlsx', index=False)
```
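For the multi-keyword case, what I have in mind is the minimal sketch below. It assumes (this is an assumption on my part, not something I have confirmed) that the standard search API combines space-separated terms with AND, so putting both words in `q` would return only tweets that contain both; the rest of the script would stay unchanged.

```python
import tweepy

# Sketch only: assumes space-separated terms in the standard search API act as AND,
# so this query should match tweets containing both 学校 and ひま.
# api is assumed to be an authenticated tweepy.API instance, as above.
q = "学校 ひま exclude:retweets"
date_since = "2020-01-01"

tweet_doc = []
for tweet in tweepy.Cursor(api.search, q=q, tweet_mode="extended", since=date_since).items(500):
    tweet_doc.append(tweet.full_text)
```

If an either-or search were needed instead, my understanding is that the `OR` operator could be used (e.g. `q = "学校 OR ひま exclude:retweets"`), but I have not verified that either.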