While collecting images by web scraping, I ran into the error shown below. I don't understand what is causing it. Could someone who knows explain? Thank you.
```
requests.exceptions.ConnectionError: HTTPConnectionPool(host='localhost', port=8050): Max retries exceeded with url: /render.html?url=https://www.google.co.jp/search?q=Ironman&tbm=isch&ijn=0 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002BA29BB12E8>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))
```
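For reference, the host and port in the traceback belong to the local Splash rendering service the first script routes through, not to Google itself. The following minimal sketch (my addition, assuming only that `requests` is installed) exercises just that connection step, so you can see whether anything is actually listening on port 8050:

```python
import requests

# Probe the local Splash endpoint the script relies on.
# WinError 10061 ("actively refused") from this call means no
# service is listening on localhost:8050.
try:
    resp = requests.get("http://localhost:8050/", timeout=5)
    print("Splash reachable, status:", resp.status_code)
except requests.exceptions.ConnectionError as exc:
    print("Nothing listening on localhost:8050:", exc)
```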
The code is shown below.
```python
import json
import os
import sys
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
import requests, time

class Google:
    def __init__(self):
        # Render the Google results page through a local Splash instance
        self.GOOGLE_SEARCH_URL = 'http://localhost:8050/render.html?url=https://www.google.co.jp/search'
        self.session = requests.session()
        self.session.headers.update(
            {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0'})

    def search(self, keyword, maximum):
        print('begin searching', keyword)
        query = self.query_gen(keyword)
        return self.image_search(query, maximum)

    # Generate URLs with query parameters appended, from the search keyword
    def query_gen(self, keyword):
        # search query generator
        page = 0
        while True:
            params = urllib.parse.urlencode({
                'q': keyword,
                'tbm': 'isch',  # 'isch' selects image search
                'ijn': str(page)})
            yield self.GOOGLE_SEARCH_URL + '?' + params
            page += 1

    # Takes the image-search URL generator (query_gen) and the number of images to fetch
    def image_search(self, query_gen, maximum):
        # search image
        result = []
        total = 0
        while True:
            # search
            html = self.session.get(next(query_gen)).text  # fetch the HTML of the image-search results page
            soup = BeautifulSoup(html, 'lxml')
            elements = soup.select('.rg_meta.notranslate')  # grab all elements with the specified classes
            jsons = [json.loads(e.get_text()) for e in elements]  # parse each element's JSON text into a dict
            image_urls = [js['ou'] for js in jsons]  # extract the image URLs

            # odd search result
            if not len(image_urls):
                print('-> no more image')
                break
            elif len(image_urls) > maximum - total:
                result += image_urls[:maximum - total]
                break
            else:
                result += image_urls
                total += len(image_urls)

        print('-> found', str(len(result)), 'image')
        return result

def main():
    google = Google()
    # error
    if len(sys.argv) != 3:
        print('invalid argument')
        print('> ./image_scraip.py [target name] [download number]')
        sys.exit()
    else:
        # save location
        name = sys.argv[1]
        data_dir = 'data/'
        os.makedirs(data_dir, exist_ok=True)
        os.makedirs('data/' + name, exist_ok=True)
        # search image
        result = google.search(
            name, maximum=int(sys.argv[2])
        )

        # download
        download_error = []
        for i in range(len(result)):
            print('-> downloading image', str(i + 1).zfill(4))
            try:  # download the image and save it locally
                urllib.request.urlretrieve(
                    result[i], data_dir + name + '/' + str(i + 1).zfill(4) + '.jpg'
                )
                time.sleep(5)
            except Exception:
                print('--> could not download image ', str(i + 1).zfill(4))
                download_error.append(i + 1)
                continue
        print('complete download')
        print('|- download', len(result) - len(download_error), 'images')
        print('|_ could not download ', len(download_error), 'images', download_error)

if __name__ == '__main__':
    main()
```
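One detail worth noting about the script above: `GOOGLE_SEARCH_URL` already ends in `?url=...`, and `query_gen` appends a second `?` plus unencoded parameters, which is exactly the doubled-`?` URL visible in the traceback. A hedged sketch of the more conventional construction for Splash's `render.html` endpoint, building the Google URL first and passing it as one encoded `url` parameter (this assumes a Splash instance on localhost:8050 and is not part of the original scripts):

```python
import urllib.parse

SPLASH_RENDER = "http://localhost:8050/render.html"
GOOGLE_SEARCH = "https://www.google.co.jp/search"

def splash_url(keyword, page=0):
    # Build the Google image-search URL first...
    google_params = urllib.parse.urlencode(
        {"q": keyword, "tbm": "isch", "ijn": str(page)}
    )
    target = GOOGLE_SEARCH + "?" + google_params
    # ...then hand it to Splash as one properly encoded 'url' parameter.
    return SPLASH_RENDER + "?" + urllib.parse.urlencode({"url": target})

print(splash_url("Ironman"))
# http://localhost:8050/render.html?url=https%3A%2F%2Fwww.google.co.jp%2Fsearch%3Fq%3DIronman%26tbm%3Disch%26ijn%3D0
```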
Separately, I have this other version:
```python
import argparse
import json
import os
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
import requests


class Google(object):
    def __init__(self):
        self.GOOGLE_SEARCH_URL = "https://www.google.co.jp/search"
        self.session = requests.session()
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) \
                Gecko/20100101 Firefox/10.0"
            }
        )

    def search(self, keyword, maximum):
        print(f"Begin searching {keyword}")
        query = self.query_gen(keyword)
        return self.image_search(query, maximum)

    def query_gen(self, keyword):
        # search query generator
        page = 0
        while True:
            params = urllib.parse.urlencode(
                {"q": keyword, "tbm": "isch", "ijn": str(page)}
            )

            yield self.GOOGLE_SEARCH_URL + "?" + params
            page += 1

    def image_search(self, query_gen, maximum):
        results = []
        total = 0
        while True:
            # search
            html = self.session.get(next(query_gen)).text
            soup = BeautifulSoup(html, "lxml")
            elements = soup.select(".rg_meta.notranslate")
            jsons = [json.loads(e.get_text()) for e in elements]
            image_url_list = [js["ou"] for js in jsons]

            # add search results
            if not len(image_url_list):
                print("-> No more images")
                break
            elif len(image_url_list) > maximum - total:
                results += image_url_list[: maximum - total]
                break
            else:
                results += image_url_list
                total += len(image_url_list)

        print("-> Found", str(len(results)), "images")
        return results


def main():
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    parser.add_argument("-t", "--target", help="target name", type=str, required=True)
    parser.add_argument(
        "-n", "--number", help="number of images", type=int, required=True
    )
    parser.add_argument(
        "-d", "--directory", help="download location", type=str, default="./data"
    )
    parser.add_argument(
        "-f",
        "--force",
        help="allow writing into an existing download directory",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    data_dir = args.directory
    target_name = args.target

    os.makedirs(data_dir, exist_ok=True)
    os.makedirs(os.path.join(data_dir, target_name), exist_ok=args.force)

    google = Google()

    # search images
    results = google.search(target_name, maximum=args.number)

    # download
    download_errors = []
    for i, url in enumerate(results):
        print("-> Downloading image", str(i + 1).zfill(4), end=" ")
        try:
            urllib.request.urlretrieve(
                url,
                os.path.join(data_dir, target_name, str(i + 1).zfill(4) + ".jpg"),
            )
            print("successful")
        except Exception:
            print("failed")
            download_errors.append(i + 1)
            continue

    print("-" * 50)
    print("Download complete")
    print("├─ Successfully downloaded", len(results) - len(download_errors), "images")
    print("└─ Failed to download", len(download_errors), "images", *download_errors)


if __name__ == "__main__":
    main()
```
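For completeness, a small sketch of how the argument parser above behaves; the file name `image_scrape.py` in the comment is hypothetical, and `parse_args` is fed a list directly instead of reading `sys.argv`:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-t", "--target", required=True)
parser.add_argument("-n", "--number", type=int, required=True)
parser.add_argument("-d", "--directory", default="./data")
parser.add_argument("-f", "--force", action="store_true", default=False)

# Equivalent to: python image_scrape.py -t Ironman -n 50
args = parser.parse_args(["-t", "Ironman", "-n", "50"])
print(args.target, args.number, args.directory, args.force)
# Ironman 50 ./data False
```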
When I run this one, it works. Does anyone know why?
