質問編集履歴
1
質問を増やしました
test
CHANGED
File without changes
|
test
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
スクレイピングをしていて下記に示すようなエラーが出てしまいました.なぜかわかりません.わかる方教えてください.よろしくお願いします.
|
1
|
+
スクレイピングで画像収集をしていて下記に示すようなエラーが出てしまいました.なぜかわかりません.わかる方教えてください.よろしくお願いします.
|
2
2
|
|
3
3
|
|
4
4
|
|
@@ -207,3 +207,243 @@
|
|
207
207
|
|
208
208
|
|
209
209
|
```
|
210
|
+
|
211
|
+
|
212
|
+
|
213
|
+
また,別のコードとして次のものがあります.
|
214
|
+
|
215
|
+
```python
|
216
|
+
|
217
|
+
import argparse
|
218
|
+
|
219
|
+
import json
|
220
|
+
|
221
|
+
import os
|
222
|
+
|
223
|
+
import urllib
|
224
|
+
|
225
|
+
|
226
|
+
|
227
|
+
from bs4 import BeautifulSoup
|
228
|
+
|
229
|
+
import requests
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
|
234
|
+
|
235
|
+
class Google(object):
|
236
|
+
|
237
|
+
def __init__(self):
|
238
|
+
|
239
|
+
self.GOOGLE_SEARCH_URL = "https://www.google.co.jp/search"
|
240
|
+
|
241
|
+
self.session = requests.session()
|
242
|
+
|
243
|
+
self.session.headers.update(
|
244
|
+
|
245
|
+
{
|
246
|
+
|
247
|
+
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) \
|
248
|
+
|
249
|
+
Gecko/20100101 Firefox/10.0"
|
250
|
+
|
251
|
+
}
|
252
|
+
|
253
|
+
)
|
254
|
+
|
255
|
+
|
256
|
+
|
257
|
+
def search(self, keyword, maximum):
|
258
|
+
|
259
|
+
print(f"Begining searching {keyword}")
|
260
|
+
|
261
|
+
query = self.query_gen(keyword)
|
262
|
+
|
263
|
+
return self.image_search(query, maximum)
|
264
|
+
|
265
|
+
|
266
|
+
|
267
|
+
def query_gen(self, keyword):
|
268
|
+
|
269
|
+
# search query generator
|
270
|
+
|
271
|
+
page = 0
|
272
|
+
|
273
|
+
while True:
|
274
|
+
|
275
|
+
params = urllib.parse.urlencode(
|
276
|
+
|
277
|
+
{"q": keyword, "tbm": "isch", "ijn": str(page)}
|
278
|
+
|
279
|
+
)
|
280
|
+
|
281
|
+
|
282
|
+
|
283
|
+
yield self.GOOGLE_SEARCH_URL + "?" + params
|
284
|
+
|
285
|
+
page += 1
|
286
|
+
|
287
|
+
|
288
|
+
|
289
|
+
def image_search(self, query_gen, maximum):
|
290
|
+
|
291
|
+
results = []
|
292
|
+
|
293
|
+
total = 0
|
294
|
+
|
295
|
+
while True:
|
296
|
+
|
297
|
+
# search
|
298
|
+
|
299
|
+
html = self.session.get(next(query_gen)).text
|
300
|
+
|
301
|
+
soup = BeautifulSoup(html, "lxml")
|
302
|
+
|
303
|
+
elements = soup.select(".rg_meta.notranslate")
|
304
|
+
|
305
|
+
jsons = [json.loads(e.get_text()) for e in elements]
|
306
|
+
|
307
|
+
image_url_list = [js["ou"] for js in jsons]
|
308
|
+
|
309
|
+
|
310
|
+
|
311
|
+
# add search results
|
312
|
+
|
313
|
+
if not len(image_url_list):
|
314
|
+
|
315
|
+
print("-> No more images")
|
316
|
+
|
317
|
+
break
|
318
|
+
|
319
|
+
elif len(image_url_list) > maximum - total:
|
320
|
+
|
321
|
+
results += image_url_list[: maximum - total]
|
322
|
+
|
323
|
+
break
|
324
|
+
|
325
|
+
else:
|
326
|
+
|
327
|
+
results += image_url_list
|
328
|
+
|
329
|
+
total += len(image_url_list)
|
330
|
+
|
331
|
+
|
332
|
+
|
333
|
+
print("-> Found", str(len(results)), "images")
|
334
|
+
|
335
|
+
return results
|
336
|
+
|
337
|
+
|
338
|
+
|
339
|
+
|
340
|
+
|
341
|
+
def main():
|
342
|
+
|
343
|
+
parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
|
344
|
+
|
345
|
+
parser.add_argument("-t", "--target", help="target name", type=str, required=True)
|
346
|
+
|
347
|
+
parser.add_argument(
|
348
|
+
|
349
|
+
"-n", "--number", help="number of images", type=int, required=True
|
350
|
+
|
351
|
+
)
|
352
|
+
|
353
|
+
parser.add_argument(
|
354
|
+
|
355
|
+
"-d", "--directory", help="download location", type=str, default="./data"
|
356
|
+
|
357
|
+
)
|
358
|
+
|
359
|
+
parser.add_argument(
|
360
|
+
|
361
|
+
"-f",
|
362
|
+
|
363
|
+
"--force",
|
364
|
+
|
365
|
+
help="download overwrite existing file",
|
366
|
+
|
367
|
+
type=bool,
|
368
|
+
|
369
|
+
default=False,
|
370
|
+
|
371
|
+
)
|
372
|
+
|
373
|
+
|
374
|
+
|
375
|
+
args = parser.parse_args()
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
data_dir = args.directory
|
380
|
+
|
381
|
+
target_name = args.target
|
382
|
+
|
383
|
+
|
384
|
+
|
385
|
+
os.makedirs(data_dir, exist_ok=True)
|
386
|
+
|
387
|
+
os.makedirs(os.path.join(data_dir, target_name), exist_ok=args.force)
|
388
|
+
|
389
|
+
|
390
|
+
|
391
|
+
google = Google()
|
392
|
+
|
393
|
+
|
394
|
+
|
395
|
+
# search images
|
396
|
+
|
397
|
+
results = google.search(target_name, maximum=args.number)
|
398
|
+
|
399
|
+
|
400
|
+
|
401
|
+
# download
|
402
|
+
|
403
|
+
download_errors = []
|
404
|
+
|
405
|
+
for i, url in enumerate(results):
|
406
|
+
|
407
|
+
print("-> Downloading image", str(i + 1).zfill(4), end=" ")
|
408
|
+
|
409
|
+
try:
|
410
|
+
|
411
|
+
urllib.request.urlretrieve(
|
412
|
+
|
413
|
+
url,
|
414
|
+
|
415
|
+
os.path.join(*[data_dir, target_name, str(i + 1).zfill(4) + ".jpg"]),
|
416
|
+
|
417
|
+
)
|
418
|
+
|
419
|
+
print("successful")
|
420
|
+
|
421
|
+
except BaseException:
|
422
|
+
|
423
|
+
print("failed")
|
424
|
+
|
425
|
+
download_errors.append(i + 1)
|
426
|
+
|
427
|
+
continue
|
428
|
+
|
429
|
+
|
430
|
+
|
431
|
+
print("-" * 50)
|
432
|
+
|
433
|
+
print("Complete downloaded")
|
434
|
+
|
435
|
+
print("├─ Successful downloaded", len(results) - len(download_errors), "images")
|
436
|
+
|
437
|
+
print("└─ Failed to download", len(download_errors), "images", *download_errors)
|
438
|
+
|
439
|
+
|
440
|
+
|
441
|
+
|
442
|
+
|
443
|
+
if __name__ == "__main__":
|
444
|
+
|
445
|
+
main()
|
446
|
+
|
447
|
+
```
|
448
|
+
|
449
|
+
こちらのコードを実行すると問題なく動作するのですが,なぜこちらは動作するのかわかりますでしょうか?
|