python
# Extract the top-n feature words per document from a TF-IDF matrix.
# Assumes `vectorizer` (a TfidfVectorizer) and `wakatilist` (an iterable of
# whitespace-tokenized documents) are defined earlier — TODO confirm upstream.
tfidf = vectorizer.fit_transform(wakatilist).toarray()
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
feature_names = np.array(vectorizer.get_feature_names_out())
# For each row (document), term indices sorted by descending TF-IDF score.
index = tfidf.argsort(axis=1)[:, ::-1]
n = 10  # how many feature words to keep per document
# Map each document's top-n term indices back to the vocabulary strings.
feature_words = [feature_names[doc[:n]] for doc in index]
各文書ごとに特徴語を取り出したければ、こんな感じでいけると思います。
追記
せっかくなので簡単なサンプルを。
python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20-newsgroups corpus (downloaded and cached on first use).
news20 = fetch_20newsgroups()
# Drop terms appearing in fewer than 3% of the documents.
vectorizer = TfidfVectorizer(min_df=0.03)

# Fit on the first 1000 posts and densify the TF-IDF matrix.
tfidf = vectorizer.fit_transform(news20.data[:1000]).toarray()
# NOTE: get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is the supported replacement.
feature_names = np.array(vectorizer.get_feature_names_out())
# Per-document term indices sorted by descending TF-IDF score.
index = tfidf.argsort(axis=1)[:, ::-1]
# Full vocabulary, reordered per document from most to least characteristic.
feature_words = [feature_names[doc] for doc in index]

n = 5   # how many top words to show per document
m = 10  # how many articles to sample

# Human-readable newsgroup label for each sampled article.
targets = np.array(news20.target_names)[news20.target[:m]]

for fwords, target in zip(feature_words, targets):
    print(target)
    print(fwords[:n])

""" =>
rec.autos
['car' 'was' 'this' 'the' 'where']
comp.sys.mac.hardware
['washington' 'add' 'guy' 'speed' 'call']
comp.sys.mac.hardware
['the' 'display' 'anybody' 'heard' 'disk']
comp.graphics
['division' 'chip' 'systems' 'computer' 'four']
sci.space
['error' 'known' 'tom' 'memory' 'the']
talk.politics.guns
['of' 'the' 'com' 'to' 'says']
sci.med
['thanks' 'couldn' 'instead' 'file' 'everyone']
comp.sys.ibm.pc.hardware
['chip' 'is' 'fast' 'ibm' 'bit']
comp.os.ms-windows.misc
['win' 'help' 'please' 'appreciated' 'figure']
comp.sys.mac.hardware
['the' 'file' 'lost' 've' 'it']
"""
それっぽく動いているようです。
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。