回答編集履歴
1
"Some characters could not be decoded 〜" 発生時の処理を変更
test
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
> ひとつは、コンテンツが html であるべきところ、xml になっている。
|
2
2
|
|
3
|
-
`warning` モジュールを利用して、warning をエラー扱いにする方法があります。
|
3
|
+
`warnings` モジュールを利用して、warning をエラー扱いにする方法があります。
|
4
4
|
|
5
5
|
> Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
|
6
6
|
>
|
@@ -15,12 +15,14 @@
|
|
15
15
|
```python
|
16
16
|
import requests
|
17
17
|
import warnings
|
18
|
+
import logging
|
18
19
|
from bs4 import BeautifulSoup
|
19
20
|
from bs4.builder import XMLParsedAsHTMLWarning
|
20
21
|
|
21
|
-
# Warning を例外(exception)化
|
22
22
|
warnings.resetwarnings()
|
23
23
|
warnings.simplefilter('error')
|
24
|
+
logger = logging.getLogger('bs4.dammit')
|
25
|
+
logger.disabled = True
|
24
26
|
|
25
27
|
urls = [
|
26
28
|
'https://teratail.com/',
|
@@ -31,18 +33,18 @@
|
|
31
33
|
for url in urls:
|
32
34
|
res = requests.get(url)
|
33
35
|
res.raise_for_status()
|
36
|
+
|
34
37
|
try:
|
35
38
|
soup = BeautifulSoup(res.content, 'html.parser')
|
36
39
|
if soup.contains_replacement_characters:
|
37
|
-
print(url)
|
40
|
+
print(f'Some characters could not be decoded: {url}')
|
38
41
|
except XMLParsedAsHTMLWarning:
|
39
|
-
print(f'XMLParsedAsHTMLWarning
|
42
|
+
print(f'XMLParsedAsHTMLWarning: {url}')
|
40
43
|
except Exception as e:
|
41
44
|
print(f'{url}: {e}')
|
42
45
|
|
43
46
|
|
44
|
-
# XMLParsedAsHTMLWarning
|
47
|
+
# XMLParsedAsHTMLWarning: https://www.sitemaps.org/sitemap.xml
|
45
|
-
# Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
|
46
|
-
# https://www.jleague.jp/standings/j1/
|
48
|
+
# Some characters could not be decoded: https://www.jleague.jp/standings/j1/
|
47
49
|
```
|
48
50
|
|