質問編集履歴
1
これが全文になります。環境はPython 3.7.4 でVScodeにてコードを書いています。
test
CHANGED
File without changes
|
test
CHANGED
@@ -1,6 +1,18 @@
|
|
1
1
|
RSSのフィードをパースしようとしてまずはフィードから全ての単語を取り出す関数を作ろうとしたのですがVScode上で以下のようなエラーが発生します。エラーを調べましたが
|
2
2
|
|
3
3
|
```python
|
4
|
+
|
5
|
+
import feedparser
|
6
|
+
|
7
|
+
import re
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
# RSSフィードのタイトルと、単語の頻度のディクショナリを返す
|
12
|
+
|
13
|
+
|
14
|
+
|
15
|
+
|
4
16
|
|
5
17
|
def getwordcounts(url):
|
6
18
|
|
@@ -36,11 +48,93 @@
|
|
36
48
|
|
37
49
|
return d.feed.title, wc
|
38
50
|
|
39
|
-
```
|
40
51
|
|
41
|
-
```
|
42
52
|
|
53
|
+
|
54
|
+
|
55
|
+
def getwords(html):
|
56
|
+
|
57
|
+
# 全てのHTMLタグを取り除く
|
58
|
+
|
59
|
+
txt = re.compile(r'<[^>]+>').sub('', html)
|
60
|
+
|
61
|
+
|
62
|
+
|
63
|
+
# 全ての日アルファベット文字で分割する
|
64
|
+
|
65
|
+
words = re.compile(r'[^A-Z^a-z]+').split(txt)
|
66
|
+
|
67
|
+
# 小文字に変換する
|
68
|
+
|
69
|
+
return [word.lower() for word in words if word != '']
|
70
|
+
|
71
|
+
|
72
|
+
|
73
|
+
apcount = {}
|
74
|
+
|
75
|
+
wordcounts = {}
|
76
|
+
|
77
|
+
feedlist = [line for line in open('feedlist.txt')]
|
78
|
+
|
79
|
+
for feedurl in feedlist:
|
80
|
+
|
81
|
+
try:
|
82
|
+
|
83
|
+
title, wc = getwordcounts(feedurl)
|
84
|
+
|
85
|
+
wordcounts[title] = wc
|
86
|
+
|
87
|
+
for word, count in wc.items():
|
88
|
+
|
89
|
+
apcount.setdefault(word, 0)
|
90
|
+
|
91
|
+
if count > 1:
|
92
|
+
|
93
|
+
apcount[word] += 1
|
94
|
+
|
95
|
+
except:
|
96
|
+
|
97
|
+
print('Failed to parse feed %s' % feedurl)
|
98
|
+
|
99
|
+
wordlist = []
|
100
|
+
|
101
|
+
for w, bc in apcount.items():
|
102
|
+
|
103
|
+
frac = float(bc) / len(feedlist)
|
104
|
+
|
105
|
+
if frac > 0.1 and frac < 0.5:
|
106
|
+
|
107
|
+
wordlist.append(w)
|
108
|
+
|
109
|
+
|
110
|
+
|
43
|
-
|
111
|
+
out = open('blogdata.txt', 'w')
|
112
|
+
|
113
|
+
out.write('Blog')
|
114
|
+
|
115
|
+
for word in wordlist:
|
116
|
+
|
117
|
+
out.write('\t%d' % wc[word])
|
118
|
+
|
119
|
+
out.write('\n')
|
120
|
+
|
121
|
+
for blog, wc in wordcounts.items():
|
122
|
+
|
123
|
+
out.write(blog)
|
124
|
+
|
125
|
+
for word in wordlist:
|
126
|
+
|
127
|
+
if word in wc:
|
128
|
+
|
129
|
+
out.write('\t%d' % wc[word])
|
130
|
+
|
131
|
+
else:
|
132
|
+
|
133
|
+
out.write('\t0')
|
134
|
+
|
135
|
+
out.write('\n')
|
136
|
+
|
137
|
+
|
44
138
|
|
45
139
|
```
|
46
140
|
|