質問編集履歴

1

質問を増やしました

2019/06/17 07:28

投稿

Mr_K
Mr_K

スコア28

test CHANGED
File without changes
test CHANGED
@@ -1,4 +1,4 @@
1
- スクレイピングをしていて下記に示すようなエラーが出てしまいました.なぜかわかりません.わかる方教えてください.よろしくお願いします.
1
+ スクレイピングで画像収集をしていて下記に示すようなエラーが出てしまいました.なぜかわかりません.わかる方教えてください.よろしくお願いします.
2
2
 
3
3
 
4
4
 
@@ -207,3 +207,243 @@
207
207
 
208
208
 
209
209
  ```
210
+
211
+
212
+
213
+ また,別のコードとして以下のものがあります.
214
+
215
+ ```python
216
+
217
+ import argparse
218
+
219
+ import json
220
+
221
+ import os
222
+
223
+ import urllib
224
+
225
+
226
+
227
+ from bs4 import BeautifulSoup
228
+
229
+ import requests
230
+
231
+
232
+
233
+
234
+
235
+ class Google(object):
236
+
237
+ def __init__(self):
238
+
239
+ self.GOOGLE_SEARCH_URL = "https://www.google.co.jp/search"
240
+
241
+ self.session = requests.session()
242
+
243
+ self.session.headers.update(
244
+
245
+ {
246
+
247
+ "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:10.0) \
248
+
249
+ Gecko/20100101 Firefox/10.0"
250
+
251
+ }
252
+
253
+ )
254
+
255
+
256
+
257
+ def search(self, keyword, maximum):
258
+
259
+ print(f"Begining searching {keyword}")
260
+
261
+ query = self.query_gen(keyword)
262
+
263
+ return self.image_search(query, maximum)
264
+
265
+
266
+
267
+ def query_gen(self, keyword):
268
+
269
+ # search query generator
270
+
271
+ page = 0
272
+
273
+ while True:
274
+
275
+ params = urllib.parse.urlencode(
276
+
277
+ {"q": keyword, "tbm": "isch", "ijn": str(page)}
278
+
279
+ )
280
+
281
+
282
+
283
+ yield self.GOOGLE_SEARCH_URL + "?" + params
284
+
285
+ page += 1
286
+
287
+
288
+
289
+ def image_search(self, query_gen, maximum):
290
+
291
+ results = []
292
+
293
+ total = 0
294
+
295
+ while True:
296
+
297
+ # search
298
+
299
+ html = self.session.get(next(query_gen)).text
300
+
301
+ soup = BeautifulSoup(html, "lxml")
302
+
303
+ elements = soup.select(".rg_meta.notranslate")
304
+
305
+ jsons = [json.loads(e.get_text()) for e in elements]
306
+
307
+ image_url_list = [js["ou"] for js in jsons]
308
+
309
+
310
+
311
+ # add search results
312
+
313
+ if not len(image_url_list):
314
+
315
+ print("-> No more images")
316
+
317
+ break
318
+
319
+ elif len(image_url_list) > maximum - total:
320
+
321
+ results += image_url_list[: maximum - total]
322
+
323
+ break
324
+
325
+ else:
326
+
327
+ results += image_url_list
328
+
329
+ total += len(image_url_list)
330
+
331
+
332
+
333
+ print("-> Found", str(len(results)), "images")
334
+
335
+ return results
336
+
337
+
338
+
339
+
340
+
341
+ def main():
342
+
343
+ parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
344
+
345
+ parser.add_argument("-t", "--target", help="target name", type=str, required=True)
346
+
347
+ parser.add_argument(
348
+
349
+ "-n", "--number", help="number of images", type=int, required=True
350
+
351
+ )
352
+
353
+ parser.add_argument(
354
+
355
+ "-d", "--directory", help="download location", type=str, default="./data"
356
+
357
+ )
358
+
359
+ parser.add_argument(
360
+
361
+ "-f",
362
+
363
+ "--force",
364
+
365
+ help="download overwrite existing file",
366
+
367
+ type=bool,
368
+
369
+ default=False,
370
+
371
+ )
372
+
373
+
374
+
375
+ args = parser.parse_args()
376
+
377
+
378
+
379
+ data_dir = args.directory
380
+
381
+ target_name = args.target
382
+
383
+
384
+
385
+ os.makedirs(data_dir, exist_ok=True)
386
+
387
+ os.makedirs(os.path.join(data_dir, target_name), exist_ok=args.force)
388
+
389
+
390
+
391
+ google = Google()
392
+
393
+
394
+
395
+ # search images
396
+
397
+ results = google.search(target_name, maximum=args.number)
398
+
399
+
400
+
401
+ # download
402
+
403
+ download_errors = []
404
+
405
+ for i, url in enumerate(results):
406
+
407
+ print("-> Downloading image", str(i + 1).zfill(4), end=" ")
408
+
409
+ try:
410
+
411
+ urllib.request.urlretrieve(
412
+
413
+ url,
414
+
415
+ os.path.join(*[data_dir, target_name, str(i + 1).zfill(4) + ".jpg"]),
416
+
417
+ )
418
+
419
+ print("successful")
420
+
421
+ except BaseException:
422
+
423
+ print("failed")
424
+
425
+ download_errors.append(i + 1)
426
+
427
+ continue
428
+
429
+
430
+
431
+ print("-" * 50)
432
+
433
+ print("Complete downloaded")
434
+
435
+ print("├─ Successful downloaded", len(results) - len(download_errors), "images")
436
+
437
+ print("└─ Failed to download", len(download_errors), "images", *download_errors)
438
+
439
+
440
+
441
+
442
+
443
+ if __name__ == "__main__":
444
+
445
+ main()
446
+
447
+ ```
448
+
449
+ こちらのコードを実行すると問題なく画像を収集できるのですが,なぜこちらは動作するのかわかりますでしょうか?