###Background / What I want to achieve
I want to scrape web pages repeatedly.
Specifically, running python ~.py プロレス 1 at the command prompt should scrape all ten results shown on the first page of a Google search for プロレス and save them as a text file.
That is what I want to achieve.
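For reference, assuming the script is saved as bs.py (the file name that appears in the traceback below), the intended invocation would be:

```
python bs.py プロレス 1
```

Given the file-naming logic in the code below, this run would write the extracted text to プロレス 1.txt.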
###Problem / Error message
I suspect the error occurs because something is being passed in the wrong form. The following error is produced:
The same traceback is printed ten times, once per link; a single instance follows:

```
Traceback (most recent call last):
  File "bs.py", line 69, in <module>
    soup = bs4.BeautifulSoup(urllib.request.urlopen(html).read(),"html.parser")
  File "C:\Users\RYO\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\RYO\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 518, in open
    protocol = req.type
AttributeError: 'HTTPResponse' object has no attribute 'type'
```
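For context, the traceback indicates that urllib.request.urlopen() was given the HTTPResponse object returned by opener.open() instead of a URL string (or a Request object); urlopen() then tries to read req.type on it and fails. A minimal sketch reproducing the error (the URL here is arbitrary):

```python
import urllib.request

opener = urllib.request.build_opener()
resp = opener.open("https://www.google.co.jp")  # resp is an http.client.HTTPResponse

# urlopen() treats a non-string argument as a Request and reads its .type,
# which HTTPResponse does not have, raising the AttributeError above.
urllib.request.urlopen(resp)
```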
###Relevant source code
```python
# -*- coding: utf-8 -*-
# Import BeautifulSoup and urllib
#from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import bs4
from bs4 import NavigableString, Declaration, Comment, BeautifulSoup
# Command-line arguments
import sys

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

links = []      # holds the 10 links found on the results page
maintexts = []  # text to be written to the txt file

def getNavigableStrings(soup):
    if isinstance(soup, NavigableString):
        if type(soup) not in (Comment, Declaration) and soup.strip():
            yield soup
    elif soup.name not in ('script', 'style'):
        for c in soup.contents:
            for g in getNavigableStrings(c):
                yield g

# HTML extraction function
def htmlExtraction(url):
    # Fetch the HTML
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    html = opener.open(url)
    soup = BeautifulSoup(html, "html.parser")

    for url in soup.find_all("h3", {"class": "r"}):
        links.extend([url.a.get("href")])

query = ""     # encoded query
queryary = []  # query words
querytxt = ""  # txt file name
pagenum = int(sys.argv[len(sys.argv) - 1])  # page number
page = ""      # which results page to fetch

for i in range(1, len(sys.argv) - 1):
    queryary.extend([urllib.parse.quote(sys.argv[i])])  # sys.argv[1] .. sys.argv[max]
    querytxt = querytxt + sys.argv[i] + " "
    if i == 1:
        query = queryary[i-1]
    else:
        query = query + "+" + queryary[i-1]

querytxt = querytxt + sys.argv[len(sys.argv) - 1]

url1 = "https://www.google.co.jp/search?q="

if int(sys.argv[len(sys.argv) - 1]) >= 2:  # the last argument is the page number; omitted for page 1
    page = "&start=" + str((pagenum - 1) * 10)  # if the last argument is 2, fetch results 11-20
url = url1 + query + page

htmlExtraction(url)

linkcnt = 0
# Extract the body text of each result
for link in links:
    linkcnt += 1
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        html = opener.open("https://www.google.co.jp" + link)
        #soup = BeautifulSoup(html, "html.parser")
        soup = bs4.BeautifulSoup(urllib.request.urlopen(html).read(),"html.parser")  # line 69 in the traceback
        text = '\n'.join(getNavigableStrings(soup))

        maintexts.extend([text])  # accumulate the extracted text

    except urllib.error.URLError:
        print(str(linkcnt) + "page URLerror")
    except:
        import traceback
        traceback.print_exc()

# Write to file  w: overwrite  a: append
import codecs
file_object = codecs.open(querytxt + ".txt", "wb", "cp932", "ignore")
file_object.write(str(maintexts) + "\n")
file_object.close()
```
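A minimal sketch of a likely fix (an assumption based on the traceback, not a confirmed answer): opener.open() already returns a readable response, so its content can be handed to BeautifulSoup directly rather than passed through urllib.request.urlopen() a second time:

```python
# Replacement for the line the traceback points at (line 69):
# the response fetched just above is read directly.
html = opener.open("https://www.google.co.jp" + link)
soup = bs4.BeautifulSoup(html.read(), "html.parser")
```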
###Supplementary information (language/framework/tool versions, etc.)
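Not stated explicitly, but the paths in the traceback indicate Python 3.6 on Windows, with BeautifulSoup 4 (bs4).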