自然言語処理：エラー箇所と改善方法を教えてください

発生している問題・エラーメッセージ

Traceback (most recent call last):
  File "test2.py", line 57, in <module>
    test_matrix = vectorizer.transform(test_data)
  File "C:\Users\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1410, in transform
    return self._tfidf.transform(X, copy=False)
  File "C:\Users\Anaconda3\lib\site-packages\sklearn\feature_extraction\text.py", line 1114, in transform
    X = normalize(X, norm=self.norm, copy=False)
  File "C:\Users\Anaconda3\lib\site-packages\sklearn\preprocessing\data.py", line 1412, in normalize
    estimator='the normalize function', dtype=FLOAT_DTYPES)
  File "C:\Users\Anaconda3\lib\site-packages\sklearn\utils\validation.py", line 462, in check_array
    context))
ValueError: Found array with 0 sample(s) (shape=(0, 85241)) while a minimum of 1 is required by the normalize function.

該当のソースコード

python
1import glob
2import time
3from sklearn.model_selection import train_test_split
4from janome.tokenizer import Tokenizer
5from sklearn.feature_extraction.text import TfidfVectorizer
6from sklearn.naive_bayes import MultinomialNB
7from sklearn.ensemble import RandomForestClassifier
8def load_livedoor_news_corpus():
9    category = {
10        'dokujo-tsushin': 1,
11        'it-life-hack':2,
12        'kaden-channel': 3,
13        'livedoor-homme': 4,
14        'movie-enter': 5,
15        'peachy': 6,
16        'smax': 7,
17        'sports-watch': 8,
18        'topic-news':9
19    }
20    docs  = []
21    labels = []
22
23    for c_name, c_id in category.items():
24        files = glob.glob("./text/{c_name}/{c_name}*.txt".format(c_name=c_name))
25
26        text = ''
27        for file in files:
28            with open(file, 'r', encoding="utf-8") as f:
29                lines = f.read().splitlines()
30
31                url = lines[0]
32                datetime = lines[1]
33                subject = lines[2]
34                body = "".join(lines[3:])
35                text = subject + body
36
37            docs.append(text)
38            labels.append(c_id)
39
40    return docs, labels
41
42docs, labels = load_livedoor_news_corpus()
43
44import random
45
46random.seed()
47indices = list(range(len(docs)))
48random.shuffle(indices)
49
50train_data   = [docs[i] for i in indices[0:7000]]
51train_labels = [labels[i] for i in indices[0:7000]]
52test_data    = [docs[i] for i in indices[7000:]]
53test_labels  = [labels[i] for i in indices[7000:]]
54
55vectorizer = TfidfVectorizer()
56train_matrix = vectorizer.fit_transform(train_data)
57test_matrix = vectorizer.transform(test_data)
58
59clf = MultinomiaINB
60model = clf.fit(train_matrix,train_labels)
61print(clf.score(train_matrix,train_labels))
62print(clf.score(test_matrix,test_labels))
63
64clf = RandomForestClassifier
65model = clf.fit(train_matrix,train_labels)
66print(clf.score(train_matrix,train_labels))
67print(clf.score(test_matrix,test_labels))
68
69
70def tokenize(text):
71    tokens = t.tokenize(','.join(text))
72    noun = []
73    for token in tokens:
74        partOfSpeech = token.part_of_speech.split(',')[0]
75        if partOfSpeech == "名詞":
76            noun.append(token.surface)
77        if partOfSpeech == "動詞":
78            noun.append(token.surface)
79        if partOfSpeech == "形容詞":
80            noun.append(token.surface)
81        if partOfSpeech == "形容動詞":
82            noun.append(token.surface)
83    return noun

試したこと

Found array with 0 sample(s) (shape=(0, 85241)) while a minimum of 1 is required by the normalize function.
正規化関数では最低1が必要ですが、0サンプルの配列が見つかりました
（shape =（0、85241））。