質問編集履歴

コードを追加しました

2017/07/09 13:29

投稿

takumiURIRIN

スコア10

test CHANGED Viewed

	@@ -1 +1 @@
1	- Seleniumでスクレイピングする際にタイムアウトしてしまう
1	+ Selenium/Pythonでスクレイピングする際にタイムアウトして処理が止まってしまう

test CHANGED Viewed

@@ -28,9 +28,7 @@
 ---> 10 driver.get(link)
-     11 driver.set_script_timeout(10)
-     12
+     11
@@ -88,6 +86,248 @@
 ###コード
+```Python
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import time
+import re
+import csv
+#Open Chrome, Visit the target page
+driver = webdriver.Chrome()
+link = 'https://example.com/'
+driver.get(link)
+#Create array
+data_list = []
+property_names = []
+property_links = []
+bed_numbers = []
+bath_numbers = []
+room_sizes = []
-こちらのサイトを元にコードは書きました。
+property_prices = []
+#Get total number of properties
+data = driver.page_source.encode('utf-8')
+soup = BeautifulSoup(data,"lxml")
+property_number = soup.find("div", class_="class")
+#Calculate total pages
+page_number = int(int(property_number.text.split(" ")[0]) / 10) + 1
+print(page_number)
+#File open
+f = open('output.csv', 'w')
+writer = csv.writer(f, lineterminator='\n', delimiter=';')
+for i in range(page_number):
-http://qiita.com/kinpira/items/383b0fbee6bf229ea03d
+    link_base = 'https://example.com/?page='
+    link = link_base + str(i + 1)
+    print(link)
+    driver.get(link)
+    time.sleep(5)
+    #Convert source code to lxml
+    data = driver.page_source.encode('utf-8')
+    soup = BeautifulSoup(data,"lxml")
+    for j in range(10):
+        property_listing = soup.select("div[data]")
+        #Get property name
+        property_names.append(property_listing[j].find_all("h2")[0].text)
+        #Get property link
+        property_links.append(property_listing[j].find_all("a")[0].get("href"))
+        #Get bed number, bath number, room size
+        string = property_listing[j].select(".listing")[0]
+        index = str(string).find("bed")
+        if index != -1:
+            bed_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip()))
+            index = str(string).find("bath")
+            if index != -1:
+                bath_numbers.append(int(property_listing[j].select(".listing")[1].text.lstrip()))
+                index = str(string).find("sqm")
+                if index != -1:
+                    room_sizes.append(int(property_listing[j].select(".listing")[2].text.lstrip().split("m")[0]))
+                else:
+                    room_sizes.append("")
+            else:
+                bath_numbers.append("")
+                index = str(string).find("sqm")
+                if index != -1:
+                    room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0]))
+                else:
+                    room_sizes.append("")
+        else:
+            bed_numbers.append("")
+            index = str(string).find("bath")
+            if index != -1:
+                bath_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip()))
+                index = str(string).find("sqm")
+                if index != -1:
+                    room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0]))
+                else:
+                    room_sizes.append("")
+            else:
+                bath_numbers.append("")
+                index = str(string).find("sqm")
+                if index != -1:
+                    room_sizes.append(int(property_listing[j].select(".listing")[0].text.lstrip().split("m")[0]))
+                else:
+                    room_sizes.append("")
+        #Get property price
+        string = property_listing[j]
+        index = str(string).find("listing-price")
+        if index != -1:
+            property_prices.append(str(property_listing[j].select(".listing-price")[0].text.lstrip().split("\n")[0]))
+        else:
+            property_prices.append("")
+    #Wait for JS running
+    time.sleep(5)
+print(property_names)
+print(property_links)
+print(bed_numbers)
+print(bath_numbers)
+print(room_sizes)
+print(property_prices)
+body = [property_names, property_links, bed_numbers, bath_numbers, room_sizes, property_prices]
+# Output
+writer.writerows(body)
+# Close file
+f.close()
+driver.quit()
+```