回答編集履歴
1
追記
test
CHANGED
@@ -13,3 +13,103 @@
|
|
13
13
|
```
|
14
14
|
|
15
15
|
各文書ごとに特徴語を取り出したければ、こんな感じでいけると思います。
|
16
|
+
|
17
|
+
|
18
|
+
|
19
|
+
### 追記
|
20
|
+
|
21
|
+
せっかくなので簡単なサンプルを。
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
```python
|
26
|
+
|
27
|
+
import numpy as np
|
28
|
+
|
29
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
30
|
+
|
31
|
+
from sklearn.datasets import fetch_20newsgroups
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
news20 = fetch_20newsgroups()
|
36
|
+
|
37
|
+
vectorizer = TfidfVectorizer(min_df=0.03)
|
38
|
+
|
39
|
+
|
40
|
+
|
41
|
+
tfidf = vectorizer.fit_transform(news20.data[:1000]).toarray()
|
42
|
+
|
43
|
+
feature_names = np.array(vectorizer.get_feature_names())
|
44
|
+
|
45
|
+
index = tfidf.argsort(axis=1)[:,::-1]
|
46
|
+
|
47
|
+
feature_words = [feature_names[doc] for doc in index]
|
48
|
+
|
49
|
+
|
50
|
+
|
51
|
+
n = 5 # top何単語取るか
|
52
|
+
|
53
|
+
m = 10 # 何記事サンプルとして抽出するか
|
54
|
+
|
55
|
+
targets = np.array(news20.target_names)[news20.target[:m]]
|
56
|
+
|
57
|
+
|
58
|
+
|
59
|
+
for fwords, target in zip(feature_words, targets):
|
60
|
+
|
61
|
+
    print(target)
|
62
|
+
|
63
|
+
    print(fwords[:n])
|
64
|
+
|
65
|
+
|
66
|
+
|
67
|
+
""" =>
|
68
|
+
|
69
|
+
rec.autos
|
70
|
+
|
71
|
+
['car' 'was' 'this' 'the' 'where']
|
72
|
+
|
73
|
+
comp.sys.mac.hardware
|
74
|
+
|
75
|
+
['washington' 'add' 'guy' 'speed' 'call']
|
76
|
+
|
77
|
+
comp.sys.mac.hardware
|
78
|
+
|
79
|
+
['the' 'display' 'anybody' 'heard' 'disk']
|
80
|
+
|
81
|
+
comp.graphics
|
82
|
+
|
83
|
+
['division' 'chip' 'systems' 'computer' 'four']
|
84
|
+
|
85
|
+
sci.space
|
86
|
+
|
87
|
+
['error' 'known' 'tom' 'memory' 'the']
|
88
|
+
|
89
|
+
talk.politics.guns
|
90
|
+
|
91
|
+
['of' 'the' 'com' 'to' 'says']
|
92
|
+
|
93
|
+
sci.med
|
94
|
+
|
95
|
+
['thanks' 'couldn' 'instead' 'file' 'everyone']
|
96
|
+
|
97
|
+
comp.sys.ibm.pc.hardware
|
98
|
+
|
99
|
+
['chip' 'is' 'fast' 'ibm' 'bit']
|
100
|
+
|
101
|
+
comp.os.ms-windows.misc
|
102
|
+
|
103
|
+
['win' 'help' 'please' 'appreciated' 'figure']
|
104
|
+
|
105
|
+
comp.sys.mac.hardware
|
106
|
+
|
107
|
+
['the' 'file' 'lost' 've' 'it']
|
108
|
+
|
109
|
+
"""
|
110
|
+
|
111
|
+
```
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
それっぽく動いているようです。
|