リアルタイムのツイート群から抽出した名詞を使って、gensim経由でトピック分析をしたいと考えています。
初歩的な質問で恐縮ですが、エラーの解決方法を教えていただけますと幸いです。
宜しくお願いします。
エラー
Traceback (most recent call last): File "topic.py", line 21, in <module> text = corpus.text AttributeError: 'list' object has no attribute 'text'
該当のソースコード
topic.py
1import os 2import math 3from collections import Counter 4from collections import defaultdict 5import re 6from natto import MeCab 7import codecs 8import sys 9import glob 10import pandas 11import urllib.request 12from gensim import corpora, models, similarities 13from itertools import chain 14 15with codecs.open("test40.txt", "r", "utf-8") as f: 16 corpus = f.read().split("\n") 17 18text = corpus.text 19text = re.sub(r"http\S+", "", text) 20text = re.sub(r"@(\w+) ", "", text) 21#text = re.sub(r"#(\w+)", "", text) 22text = re.sub(r"(^RT.*)", "", text, flags=re.MULTILINE | re.DOTALL) 23#絵文字も消したい 24emoji_pattern = re.compile("[" 25u"\U0001F600-\U0001F64F" 26u"\U0001F300-\U0001F5FF" 27u"\U0001F680-\U0001F6FF" 28u"\U0001F1E0-\U0001F1FF" 29"]+", flags=re.UNICODE) 30text = emoji_pattern.sub("", text) 31 32mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd') 33 34#if tagger.lang == 'ja': 35#名詞の抽出と整形 36rm_list = ["RT","https","co","さん","フォロー","本日","応募","今日","プレゼント","お金","FGO","無料","本人","投稿","動画","ツイート","リツイート","Twitter","ローソン","Peing","http","Amazonギフト券","bot","発売中","Youtube","www","WWW","質問箱","コラボ","フォロワー","DM","いいね","RT","lawson","://","!","peing","youtube","抽選","jp","リプ","キャンペーン","チケット","期間限定","DHC","日本","amp","人間","チャンネル","配信中","YouTube","WEB","楽しみ","イラスト","くじ","@","__"] 37 38stop_words = [] 39path = 'stop_words.txt' 40with open(path) as g: 41 stop_words = g.readlines() 42 43docs = [] 44for txt in corpus: 45 words = mecab.parse(txt, as_nodes=True) 46 doc = [] 47 48 for w in words: 49 if w.feature.split(",")[0] == "名詞": 50 if len(w.surface) >= 3: 51 if w.surface not in rm_list: 52 doc.append(w.surface) 53 54 docs.append(doc) 55corpus = docs 56 57#辞書の作成 58dictionary = corpora.Dictionary(corpus) 59dictionary.filter_extremes(no_below=2, no_above=0.01) 60 61# コーパスを作成 62corpus_c = [dictionary.doc2bow(corpus) for text in corpus] 63 64#トピックモデルを生成 65lda = gensim.models.ldamodel.LdaModel(corpus=corpus_c, num_topics=100) 66 67# 各トピックの出現頻度上位を取得 68topic_top = [] 69for topic in lda.show_topics(-1, formatted=False): 70 topic_top.append([dictionary[int(tag[0])] for tag in topic[1]]) 71 72# 各トピックの出現頻度上位10位をcsv形式で保存 73topic_data = pandas.DataFrame(topic_top) 74topic_data.to_csv("topic_words1.csv", encoding="utf-8")
補足情報(FW/ツールのバージョンなど)
iOS 10.12.6, Python 3.7.3, Atom
回答2件
あなたの回答
tips
プレビュー