ストップワードを除去しきれない

リアルタイムのツイート群から抽出した名詞を対象に、辞書を構築したいと考えています。
ストップワードを除去してコードを実行したところ、下記のValueErrorが出てしまいました。
これに関して、解決策を教えていただけませんでしょうか？
宜しくお願いします。

エラー

Traceback (most recent call last):
  File "final.py", line 117, in <module>
    stream.sample()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 449, in sample
    self._start(is_async)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 389, in _start
    self._run()
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 320, in _run
    six.reraise(*exc_info)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/six.py", line 693, in reraise
    raise value
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 289, in _run
    self._read_loop(resp)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 351, in _read_loop
    self._data(next_status_obj)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 323, in _data
    if self.listener.on_data(data) is False:
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/tweepy/streaming.py", line 54, in on_data
    if self.on_status(status) is False:
  File "final.py", line 99, in on_status
    tfidf = vectorizer.fit_transform(corpus)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1652, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 1058, in fit_transform
    self.fixed_vocabulary_)
  File "/Users/macuser/Workspaces/jxpress/trendword/.direnv/python-3.7.3/lib/python3.7/site-packages/sklearn/feature_extraction/text.py", line 989, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words

該当のソースコード

final.py
1import os
2import tweepy
3import redis
4import math
5from collections import Counter
6from collections import defaultdict
7import re
8from natto import MeCab
9import codecs
10import sys
11from sklearn.feature_extraction.text import TfidfVectorizer
12import glob
13import numpy as np
14import urllib.request
15from gensim import corpora
16from itertools import chain
17
18#frequency = defaultdict(int)
19
20#r = redis.Redis(host='localhost', port=6379, db=0)
21
22TWITTER_CLIENT_ID = os.environ['TWITTER_CLIENT_ID']
23TWITTER_CLIENT_SECRET = os.environ['TWITTER_CLIENT_SECRET']
24
25TWITTER_OAUTH_TOKEN = os.environ['TWITTER_OAUTH_TOKEN']
26TWITTER_OAUTH_TOKEN_SECRET = os.environ['TWITTER_OAUTH_TOKEN_SECRET']
27
28auth = tweepy.OAuthHandler(TWITTER_CLIENT_ID,TWITTER_CLIENT_SECRET)
29auth.set_access_token(TWITTER_OAUTH_TOKEN,TWITTER_OAUTH_TOKEN_SECRET)
30
31class StreamListener(tweepy.StreamListener):
32    def __init__(self):
33        super().__init__()
34        self.count = 0 # 取得したtweet数
35
36    def on_status(self, status):
37        text = str(status.text)
38        #https://www.pytry3g.com/entry/master-Preprocessing#正規表現を使う
39        text = re.sub(r"http\S+", "", text)
40        text = re.sub(r"@(\w+) ", "", text)
41        text = re.sub(r"#(\w+)", "", text)
42        #リツイートは消えていない
43        text = re.sub(r"(^RT.*)", "", text, flags=re.MULTILINE | re.DOTALL)
44        #絵文字も消したい
45        emoji_pattern = re.compile("["
46        u"\U0001F600-\U0001F64F"
47        u"\U0001F300-\U0001F5FF"
48        u"\U0001F680-\U0001F6FF"
49        u"\U0001F1E0-\U0001F1FF"
50        "]+", flags=re.UNICODE)
51        text = emoji_pattern.sub("", text)
52
53        #日本語ツイートをファイルに書き込んでいる + ツイートの数を表示
54        if status.lang == "ja":
55            self.count += 1
56            print(self.count, text)
57            with open("test45.txt", "a", encoding="utf-8") as f:
58                f.write(text)
59            with codecs.open("test45.txt", "r", "utf-8") as f:
60                corpus = f.read().split("\n")
61
62            mecab = MeCab('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')
63
64            #if tagger.lang == 'ja':
65            #名詞の抽出と整形
66            rm_list = ["RT","https","co","さん","フォロー","本日","応募","今日","プレゼント","お金","FGO","無料","本人","投稿","動画","ツイート","リツイート","Twitter","ローソン","Peing","http","Amazonギフト券","bot","発売中","Youtube","www","WWW","質問箱","コラボ","フォロワー","DM","いいね","ＲＴ","lawson","://","！","peing","youtube","抽選","jp","リプ","キャンペーン","チケット","期間限定","DHC","日本","amp","人間","チャンネル","配信中","YouTube","WEB","楽しみ","イラスト","くじ","@","__"]
67
68            stop_words = []
69            path = 'stop_words.txt'
70            with open(path) as g:
71                stop_words = g.readlines()
72
73            docs = []
74            for txt in corpus:
75                words = mecab.parse(txt, as_nodes=True)
76                doc = []
77
78                for w in words:
79                    if w.feature.split(",")[0] == "名詞":
80                        if len(w.surface) >= 3:
81                            if not any(rm in w.surface for rm in rm_list):
82                                doc.extend(str(w.surface))
83
84                docs.extend(doc)
85            corpus = docs
86
87            #辞書の構築
88            dictionary = corpora.Dictionary([corpus])
89            dictionary.filter_extremes(no_below=20, no_above=0.5)
90            dictionary.save_as_text('test40-dic8.txt')
91
92            #単語の頻出度の抽出
93            counter = Counter(chain.from_iterable(corpus))
94            for word, count in counter.most_common():
95                if count > 19:
96                    print(f"{word}: {count}")
97
98　　　  #tf-idf計算
99            vectorizer = TfidfVectorizer(min_df=0.02)
100            tfidf = vectorizer.fit_transform(corpus)
101
102            #スコアの表示
103            print(tfidf.toarray())
104            # テキストの数、出現した単語の数
105            print(tfidf.shape)
106
107            #並べ替え
108            feature_names = np.array(vectorizer.get_feature_names())
109            for vec in tfidf:
110                index = np.argsort(vec.toarray(), axis=1)[:,::-1]
111                feature_words = feature_names[index]
112                print(feature_words[:,:10])
113
114    def on_error(self, status_code):
115        return False
116
117stream = tweepy.Stream(auth=auth, listener=StreamListener())
118stream.sample()

補足情報（FW/ツールのバージョンなど）

iOS 10.12.6, Python 3.7.3, Atom

行動規範の内容に同意します

回答1件

ベストアンサー

ストップワードどうこうではなくて、データの作り方が間違っているように思います。doc.extend(str(w.surface))ではなくdoc.append(w.surface)に、docs.extend(doc)ではなくdocs.append(doc)にしてください。TfidfVectorizerの側ではTfidfVectorizer(min_df=0.02, analyzer=lambda x:x)とするなどして調整してください。

あと、if not any(rm in w.surface for rm in rm_list):の部分も意図がよくわかりません。w.surface not in rm_listとかでいいのでは。

投稿2019/08/19 07:08