###実現したいこと:Selenium/PythonでWEBサイトをスクレイピングしたい
Selenium/Pythonを用いてWEBサイトのスクレイピングを行っています。
スクレイピング自体は問題なくできているのですが、スクレイピングするページ数が多く途中で停止してしまいます。
最後までスクレイピングを行えるようにしたいです。
###発生している問題・エラーメッセージ:Timeoutしてしまう
--------------------------------------------------------------------------- TimeoutException Traceback (most recent call last) <ipython-input-11-66fb62304bd9> in <module>() 8 driver = webdriver.Chrome() 9 link = 'example.com' ---> 10 driver.get(link) 11 ~/.pyenv/versions/3.6.0/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in get(self, url) 266 Loads a web page in the current browser session. 267 """ --> 268 self.execute(Command.GET, {'url': url}) 269 270 @property ~/.pyenv/versions/3.6.0/lib/python3.6/site-packages/selenium/webdriver/remote/webdriver.py in execute(self, driver_command, params) 254 response = self.command_executor.execute(driver_command, params) 255 if response: --> 256 self.error_handler.check_response(response) 257 response['value'] = self._unwrap_value( 258 response.get('value', None)) ~/.pyenv/versions/3.6.0/lib/python3.6/site-packages/selenium/webdriver/remote/errorhandler.py in check_response(self, response) 192 elif exception_class == UnexpectedAlertPresentException and 'alert' in value: 193 raise exception_class(message, screen, stacktrace, value['alert'].get('text')) --> 194 raise exception_class(message, screen, stacktrace) 195 196 def _value_or_default(self, obj, key, default): TimeoutException: Message: timeout (Session info: chrome=59.0.3071.115) (Driver info: chromedriver=2.30.477690 (c53f4ad87510ee97b5c3425a14c0e79780cdf262),platform=Mac OS X 10.12.4 x86_64)
###コード
Python
1from selenium import webdriver 2from bs4 import BeautifulSoup 3import time 4import re 5import csv 6 7#Open Chrome, Visit the target page 8driver = webdriver.Chrome() 9link = 'https://example.com/' 10driver.get(link) 11 12#Create array 13data_list = [] 14property_names = [] 15property_links = [] 16bed_numbers = [] 17bath_numbers = [] 18room_sizes = [] 19property_prices = [] 20 21#Get total number of properties 22data = driver.page_source.encode('utf-8') 23soup = BeautifulSoup(data,"lxml") 24property_number = soup.find("div", class_="class") 25 26#Calculate total pages 27page_number = int(int(property_number.text.split(" ")[0]) / 10) + 1 28print(page_number) 29 30#File open 31f = open('output.csv', 'w') 32writer = csv.writer(f, lineterminator='\n', delimiter=';') 33 34for i in range(page_number): 35 link_base = 'https://example.com/?page=' 36 link = link_base + str(i + 1) 37 print(link) 38 driver.get(link) 39 40 time.sleep(5) 41 42 #Convert source code to lxml 43 data = driver.page_source.encode('utf-8') 44 soup = BeautifulSoup(data,"lxml") 45 46 for j in range(10): 47 property_listing = soup.select("div[data]") 48 49 #Get property name 50 property_names.append(property_listing[j].find_all("h2")[0].text) 51 52 #Get property link 53 property_links.append(property_listing[j].find_all("a")[0].get("href")) 54 55 #Get bed number, bath number, room size 56 string = property_listing[j].select(".listing")[0] 57 index = str(string).find("bed") 58 if index != -1: 59 bed_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip())) 60 index = str(string).find("bath") 61 if index != -1: 62 bath_numbers.append(int(property_listing[j].select(".listing")[1].text.lstrip())) 63 index = str(string).find("sqm") 64 if index != -1: 65 room_sizes.append(int(property_listing[j].select(".listing")[2].text.lstrip().split("m")[0])) 66 else: 67 room_sizes.append("") 68 else: 69 bath_numbers.append("") 70 index = str(string).find("sqm") 71 if index != -1: 72 room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0])) 73 else: 74 room_sizes.append("") 75 76 else: 77 bed_numbers.append("") 78 index = str(string).find("bath") 79 if index != -1: 80 bath_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip())) 81 index = str(string).find("sqm") 82 if index != -1: 83 room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0])) 84 else: 85 room_sizes.append("") 86 else: 87 bath_numbers.append("") 88 index = str(string).find("sqm") 89 if index != -1: 90 room_sizes.append(int(property_listing[j].select(".listing")[0].text.lstrip().split("m")[0])) 91 else: 92 room_sizes.append("") 93 94 #Get property price 95 string = property_listing[j] 96 97 index = str(string).find("listing-price") 98 if index != -1: 99 property_prices.append(str(property_listing[j].select(".listing-price")[0].text.lstrip().split("\n")[0])) 100 else: 101 property_prices.append("") 102 103 #Wait for JS running 104 time.sleep(5) 105 106print(property_names) 107print(property_links) 108print(bed_numbers) 109print(bath_numbers) 110print(room_sizes) 111print(property_prices) 112 113body = [property_names, property_links, bed_numbers, bath_numbers, room_sizes, property_prices] 114 115# Output 116writer.writerows(body) 117 118# Close file 119f.close() 120 121driver.quit()
あなたの回答
tips
プレビュー