実現したいこととエラー
スクレイピングで鳥の画像を集めたいと思い、コードを調べて書いたのですが、
「Begining searching bird
-> No more images
-> Found 0 images
Complete downloaded
└─ Failed to download 0 images」
となってしまい、画像が集められません。何が原因なのか調べても全く分からなかったので質問させていただきました。
該当のソースコード
import argparse  # BUG FIX: used by main() but missing from the original imports
import json
import os
import urllib.parse    # BUG FIX: explicit submodule imports; plain ``import urllib``
import urllib.request  # does not guarantee .parse/.request are available

from bs4 import BeautifulSoup
import requests


class Google(object):
    """Scrape image URLs from Google Image Search result pages.

    NOTE(review): this is the cause of the reported "Found 0 images".
    Google stopped rendering results inside ``.rg_meta.notranslate``
    elements (the result data is now delivered via inline JavaScript),
    so the CSS selector below matches nothing on current pages.
    Scraping Google search is also against its Terms of Service; the
    robust fix is the official Custom Search JSON API.
    """

    def __init__(self):
        self.GOOGLE_SEARCH_URL = "https://www.google.co.jp/search"
        self.session = requests.session()
        # BUG FIX: the original used a backslash line-continuation inside
        # the string literal, which embedded a newline's worth of leading
        # spaces into the header; implicit concatenation keeps it clean.
        self.session.headers.update(
            {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) "
                "Gecko/20100101 Firefox/10.0"
            }
        )

    def search(self, keyword, maximum):
        """Return a list of up to *maximum* image URLs for *keyword*."""
        print(f"Begining searching {keyword}")
        query = self.query_gen(keyword)
        return self.image_search(query, maximum)

    def query_gen(self, keyword):
        """Yield image-search result-page URLs, one page at a time, forever."""
        page = 0
        while True:
            params = urllib.parse.urlencode(
                {"q": keyword, "tbm": "isch", "ijn": str(page)}
            )
            yield self.GOOGLE_SEARCH_URL + "?" + params
            page += 1

    def image_search(self, query_gen, maximum):
        """Collect image URLs from successive result pages.

        Stops once *maximum* URLs are gathered or a page yields no
        results — which, with current Google markup, happens on the very
        first page (see the class-level NOTE).
        """
        results = []
        total = 0
        while True:
            html = self.session.get(next(query_gen)).text
            soup = BeautifulSoup(html, "lxml")
            # Obsolete selector: modern result pages contain no .rg_meta
            # elements, so ``elements`` is always empty today.
            elements = soup.select(".rg_meta.notranslate")
            jsons = [json.loads(e.get_text()) for e in elements]
            image_url_list = [js["ou"] for js in jsons]

            if not image_url_list:
                print("-> No more images")
                break
            elif len(image_url_list) > maximum - total:
                # Last page needed: take only enough to reach *maximum*.
                results += image_url_list[: maximum - total]
                break
            else:
                results += image_url_list
                total += len(image_url_list)

        print("-> Found", str(len(results)), "images")
        return results


def main():
    """Parse CLI arguments, search for images, and download them as JPEGs."""
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    parser.add_argument("-t", "--target", help="target name", type=str, required=True)
    parser.add_argument(
        "-n", "--number", help="number of images", type=int, required=True
    )
    parser.add_argument(
        "-d", "--directory", help="download location", type=str, default="./data"
    )
    # BUG FIX: the original used ``type=bool``, for which any non-empty
    # value — even the string "False" — parses as True.  A store_true
    # flag is what was intended.
    parser.add_argument(
        "-f",
        "--force",
        help="download overwrite existing file",
        action="store_true",
        default=False,
    )

    args = parser.parse_args()

    data_dir = args.directory
    target_name = args.target

    os.makedirs(data_dir, exist_ok=True)
    # Without --force, an existing target directory raises FileExistsError
    # instead of being silently overwritten.
    os.makedirs(os.path.join(data_dir, target_name), exist_ok=args.force)

    google = Google()

    # search images
    results = google.search(target_name, maximum=args.number)

    # download: best-effort, per-image failures are recorded, not fatal
    download_errors = []
    for i, url in enumerate(results):
        print("-> Downloading image", str(i + 1).zfill(4), end=" ")
        try:
            urllib.request.urlretrieve(
                url,
                os.path.join(data_dir, target_name, str(i + 1).zfill(4) + ".jpg"),
            )
            print("successful")
        # BUG FIX: was ``except BaseException``, which also swallowed
        # KeyboardInterrupt/SystemExit; this keeps the best-effort loop
        # but lets the user abort with Ctrl-C.
        except Exception:
            print("failed")
            download_errors.append(i + 1)

    print("-" * 50)
    print("Complete downloaded")
    print("├─ Successful downloaded", len(results) - len(download_errors), "images")
    print("└─ Failed to download", len(download_errors), "images", *download_errors)


if __name__ == "__main__":
    main()
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。
2020/03/17 06:33