質問編集履歴
1
コードを追加しました
test
CHANGED
@@ -1 +1 @@
|
|
1
|
-
Seleniumでスクレイピングする際にタイムアウトしてしまう
|
1
|
+
Selenium/Pythonでスクレイピングする際にタイムアウトして処理が止まってしまう
|
test
CHANGED
@@ -28,9 +28,7 @@
|
|
28
28
|
|
29
29
|
---> 10 driver.get(link)
|
30
30
|
|
31
|
-
11 driver.set_script_timeout(10)
|
32
|
-
|
33
|
-
1
|
31
|
+
11
|
34
32
|
|
35
33
|
|
36
34
|
|
@@ -88,6 +86,248 @@
|
|
88
86
|
|
89
87
|
###コード
|
90
88
|
|
89
|
+
```Python
|
90
|
+
|
91
|
+
from selenium import webdriver
|
92
|
+
|
93
|
+
from bs4 import BeautifulSoup
|
94
|
+
|
95
|
+
import time
|
96
|
+
|
97
|
+
import re
|
98
|
+
|
99
|
+
import csv
|
100
|
+
|
101
|
+
|
102
|
+
|
103
|
+
#Open Chrome and visit the target page
|
104
|
+
|
105
|
+
driver = webdriver.Chrome()
|
106
|
+
|
107
|
+
link = 'https://example.com/'
|
108
|
+
|
109
|
+
driver.get(link)
|
110
|
+
|
111
|
+
|
112
|
+
|
113
|
+
#Create empty lists to collect the scraped values
|
114
|
+
|
115
|
+
data_list = []
|
116
|
+
|
117
|
+
property_names = []
|
118
|
+
|
119
|
+
property_links = []
|
120
|
+
|
121
|
+
bed_numbers = []
|
122
|
+
|
123
|
+
bath_numbers = []
|
124
|
+
|
125
|
+
room_sizes = []
|
126
|
+
|
91
|
-
|
127
|
+
property_prices = []
|
128
|
+
|
129
|
+
|
130
|
+
|
92
|
-
|
131
|
+
#Get total number of properties
|
132
|
+
|
133
|
+
data = driver.page_source.encode('utf-8')
|
134
|
+
|
135
|
+
soup = BeautifulSoup(data,"lxml")
|
136
|
+
|
137
|
+
property_number = soup.find("div", class_="class")
|
138
|
+
|
139
|
+
|
140
|
+
|
141
|
+
#Calculate total pages
|
142
|
+
|
143
|
+
page_number = int(int(property_number.text.split(" ")[0]) / 10) + 1
|
144
|
+
|
145
|
+
print(page_number)
|
146
|
+
|
147
|
+
|
148
|
+
|
149
|
+
#Open the output file
|
150
|
+
|
151
|
+
f = open('output.csv', 'w')
|
152
|
+
|
153
|
+
writer = csv.writer(f, lineterminator='\n', delimiter=';')
|
154
|
+
|
155
|
+
|
156
|
+
|
157
|
+
for i in range(page_number):
|
158
|
+
|
93
|
-
http://
|
159
|
+
link_base = 'https://example.com/?page='
|
160
|
+
|
161
|
+
link = link_base + str(i + 1)
|
162
|
+
|
163
|
+
print(link)
|
164
|
+
|
165
|
+
driver.get(link)
|
166
|
+
|
167
|
+
|
168
|
+
|
169
|
+
time.sleep(5)
|
170
|
+
|
171
|
+
|
172
|
+
|
173
|
+
#Parse the page source with BeautifulSoup (lxml parser)
|
174
|
+
|
175
|
+
data = driver.page_source.encode('utf-8')
|
176
|
+
|
177
|
+
soup = BeautifulSoup(data,"lxml")
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
for j in range(10):
|
182
|
+
|
183
|
+
property_listing = soup.select("div[data]")
|
184
|
+
|
185
|
+
|
186
|
+
|
187
|
+
#Get property name
|
188
|
+
|
189
|
+
property_names.append(property_listing[j].find_all("h2")[0].text)
|
190
|
+
|
191
|
+
|
192
|
+
|
193
|
+
#Get property link
|
194
|
+
|
195
|
+
property_links.append(property_listing[j].find_all("a")[0].get("href"))
|
196
|
+
|
197
|
+
|
198
|
+
|
199
|
+
#Get bed number, bath number, room size
|
200
|
+
|
201
|
+
string = property_listing[j].select(".listing")[0]
|
202
|
+
|
203
|
+
index = str(string).find("bed")
|
204
|
+
|
205
|
+
if index != -1:
|
206
|
+
|
207
|
+
bed_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip()))
|
208
|
+
|
209
|
+
index = str(string).find("bath")
|
210
|
+
|
211
|
+
if index != -1:
|
212
|
+
|
213
|
+
bath_numbers.append(int(property_listing[j].select(".listing")[1].text.lstrip()))
|
214
|
+
|
215
|
+
index = str(string).find("sqm")
|
216
|
+
|
217
|
+
if index != -1:
|
218
|
+
|
219
|
+
room_sizes.append(int(property_listing[j].select(".listing")[2].text.lstrip().split("m")[0]))
|
220
|
+
|
221
|
+
else:
|
222
|
+
|
223
|
+
room_sizes.append("")
|
224
|
+
|
225
|
+
else:
|
226
|
+
|
227
|
+
bath_numbers.append("")
|
228
|
+
|
229
|
+
index = str(string).find("sqm")
|
230
|
+
|
231
|
+
if index != -1:
|
232
|
+
|
233
|
+
room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0]))
|
234
|
+
|
235
|
+
else:
|
236
|
+
|
237
|
+
room_sizes.append("")
|
238
|
+
|
239
|
+
|
240
|
+
|
241
|
+
else:
|
242
|
+
|
243
|
+
bed_numbers.append("")
|
244
|
+
|
245
|
+
index = str(string).find("bath")
|
246
|
+
|
247
|
+
if index != -1:
|
248
|
+
|
249
|
+
bath_numbers.append(int(property_listing[j].select(".listing")[0].text.lstrip()))
|
250
|
+
|
251
|
+
index = str(string).find("sqm")
|
252
|
+
|
253
|
+
if index != -1:
|
254
|
+
|
255
|
+
room_sizes.append(int(property_listing[j].select(".listing")[1].text.lstrip().split("m")[0]))
|
256
|
+
|
257
|
+
else:
|
258
|
+
|
259
|
+
room_sizes.append("")
|
260
|
+
|
261
|
+
else:
|
262
|
+
|
263
|
+
bath_numbers.append("")
|
264
|
+
|
265
|
+
index = str(string).find("sqm")
|
266
|
+
|
267
|
+
if index != -1:
|
268
|
+
|
269
|
+
room_sizes.append(int(property_listing[j].select(".listing")[0].text.lstrip().split("m")[0]))
|
270
|
+
|
271
|
+
else:
|
272
|
+
|
273
|
+
room_sizes.append("")
|
274
|
+
|
275
|
+
|
276
|
+
|
277
|
+
#Get property price
|
278
|
+
|
279
|
+
string = property_listing[j]
|
280
|
+
|
281
|
+
|
282
|
+
|
283
|
+
index = str(string).find("listing-price")
|
284
|
+
|
285
|
+
if index != -1:
|
286
|
+
|
287
|
+
property_prices.append(str(property_listing[j].select(".listing-price")[0].text.lstrip().split("\n")[0]))
|
288
|
+
|
289
|
+
else:
|
290
|
+
|
291
|
+
property_prices.append("")
|
292
|
+
|
293
|
+
|
294
|
+
|
295
|
+
#Wait for JS to finish running
|
296
|
+
|
297
|
+
time.sleep(5)
|
298
|
+
|
299
|
+
|
300
|
+
|
301
|
+
print(property_names)
|
302
|
+
|
303
|
+
print(property_links)
|
304
|
+
|
305
|
+
print(bed_numbers)
|
306
|
+
|
307
|
+
print(bath_numbers)
|
308
|
+
|
309
|
+
print(room_sizes)
|
310
|
+
|
311
|
+
print(property_prices)
|
312
|
+
|
313
|
+
|
314
|
+
|
315
|
+
body = [property_names, property_links, bed_numbers, bath_numbers, room_sizes, property_prices]
|
316
|
+
|
317
|
+
|
318
|
+
|
319
|
+
# Output
|
320
|
+
|
321
|
+
writer.writerows(body)
|
322
|
+
|
323
|
+
|
324
|
+
|
325
|
+
# Close file
|
326
|
+
|
327
|
+
f.close()
|
328
|
+
|
329
|
+
|
330
|
+
|
331
|
+
driver.quit()
|
332
|
+
|
333
|
+
```
|