Question edit history

Revision 3: Added supplementary information.

test (title): file without changes
test (body): CHANGED

@@ -256,274 +256,18 @@
 
 ## Addendum
 
-When I run the code, I get the error message below.
-
-The last line of the traceback says "TypeError: 'in <string>' requires string as left operand, not NoneType", so I suspect the problem is in one of the for loops.
-
-I'm asking because I don't know where, or how, to fix this. Please let me know.
-
-Thank you in advance.
-
-## Problem / error message
-
-Traceback (most recent call last):
-  File "d:/Pythonからspreadsheetsへ/venv1/lancerswork/scraping.py", line 111, in <module>
-    set_with_dataframe(sh, dict_df)
-  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\gspread_dataframe.py", line 241, in set_with_dataframe
-    _cellrepr(cell_value, allow_formulas))
-  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\gspread_dataframe.py", line 49, in _cellrepr
-    if pd.isnull(value) is True:
-  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\missing.py", line 126, in isna
-    return _isna(obj)
-  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\missing.py", line 141, in _isna_new
-    elif isinstance(
-  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\generic.py", line 12, in _check
-    return getattr(inst, attr, "_typ") in comp
-TypeError: 'in <string>' requires string as left operand, not NoneType
-
-## Relevant source code
-
-```python
-import requests
-from bs4 import BeautifulSoup as bs
-from selenium import webdriver
-import time
-from tqdm import tqdm
-
-import json
-import gspread
-from googleapiclient import discovery
-from oauth2client.service_account import ServiceAccountCredentials
-from gspread_dataframe import get_as_dataframe
-from gspread_dataframe import set_with_dataframe
-import pandas as pd
-
-headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
-
-office_names = []
-address_lists = []
-profile_names = []
-office_names2 = []
-
-for page in range(1,3):
-    url = 'https://www.zeiri4.com/firm/search/?FirmSearchForm%5BPrefecture_id%5D=13&FirmSearchForm%5BIndustry_id%5D%5B0%5D=1&FirmSearchForm%5BIndustry_id%5D%5B1%5D=4&page={}'.format(page)
-    response = requests.get(url, headers=headers)
-    soup = bs(response.content, 'html.parser')
-
-    for office in soup.find_all('h2'):
-        offices = office.text.strip('\n')
-        office_names.append(offices)
-        time.sleep(10)
-
-    for address in soup.find_all('dl', class_='b-firmSearchPanel__datalist'):
-        address_ = address.find('dd')
-        address_lists.append(address_)
-        time.sleep(10)
-
-    url1 = 'https://www.bengo4.com/tokyo/f_12/?page={}'.format(page)
-    response1 = requests.get(url1, headers=headers)
-    soup1 = bs(response1.content, 'html.parser')
-
-    for name in soup1.find_all(class_='profile__name'):
-        name_ = name.text
-        profile_names.append(name_)
-        time.sleep(10)
-
-    for office2 in soup1.find_all('p', class_='office'):
-        office2_ = office2.text
-        office_names2.append(office2_)
-        time.sleep(10)
-
-browser = webdriver.Chrome(r'D:\Pythonからspreadsheetsへ\venv1\lancerswork\chromedriver.exe')
-browser.implicitly_wait(10)
-
-address_lists2 = []
-
-for page in range(1,3):
-    url2 = 'https://www.bengo4.com/tokyo/f_12/?page={}'.format(page)
-    browser.get(url2)
-
-    for address2 in browser.find_elements_by_class_name('address'):
-        address2_ = address2.text
-        address_lists2.append(address2_)
-        time.sleep(10)
-
-scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
-credentials = ServiceAccountCredentials.from_json_keyfile_name('伏せておきます', scope)
-gc = gspread.authorize(credentials)
-SPREADSHEET_KEY = '1qPbI3Rf995Z53sU3sK60SlgMwtFkO13w4P-wBQlTcv8'
-worksheet = gc.open_by_key(SPREADSHEET_KEY)
-wb = worksheet.sheet1
-
-mydict = {
-    '事務所名':office_names,
-    '住所':address_lists,
-    '':'',
-    '弁護士名':profile_names,
-    '事務所名1':office_names2,
-    '住所1':address_lists2,
+When I run it with an arbitrary hand-made dictionary like the one below it works fine, but with the scraped data it raises the error.
+
+mydict={
+    'yoo':2,
+    'poo':3,
+    'foo':4,
+    'too':5
 
 }
-
-dict_df = pd.DataFrame({key:pd.Series(value) for key, value in mydict.items()})
-
-sh = gc.open_by_key('1qPbI3Rf995Z53sU3sK60SlgMwtFkO13w4P-wBQlTcv8').worksheet('シート1')
-
-set_with_dataframe(sh, dict_df)
-```
-
-## What I tried
-
-When I kept only the first scraping block below and commented out everything else, the script wrote to the spreadsheet successfully.
-
-for office in soup.find_all('h2'):
-    offices = office.text.strip('\n')
-    office_names.append(offices)
-    time.sleep(10)
-
-## Addendum
-
-When I run it with an arbitrary hand-made dictionary like the one below it works fine, but with the scraped data it raises the error.
-
-mydict={
-    'yoo':2,
-    'poo':3,
-    'foo':4,
-    'too':5
-}
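A note on the behavior this addendum describes: the hand-made dictionary contains only plain numbers, while the scraped `address_lists` is filled with BeautifulSoup `Tag` objects, because `address.find('dd')` returns a `Tag` rather than its text. When `gspread_dataframe` later calls `pd.isnull` on such a cell, pandas probes `getattr(value, "_typ", ...)`, and a `Tag` answers any unknown attribute lookup with `None`, which is exactly the `NoneType` the TypeError complains about. A minimal sketch of the usual fix (extracting the text at append time), assuming the markup the posted script parses; the sample HTML here is hypothetical:

```python
from bs4 import BeautifulSoup

# Hypothetical snippet of the search-result markup the script parses.
html = '<dl class="b-firmSearchPanel__datalist"><dt>住所</dt><dd>東京都千代田区1-1-1</dd></dl>'
soup = BeautifulSoup(html, 'html.parser')

address_lists = []
for address in soup.find_all('dl', class_='b-firmSearchPanel__datalist'):
    address_ = address.find('dd')
    # Append the tag's text, not the Tag object itself, and guard
    # against a <dl> that has no <dd> (find() returns None then).
    address_lists.append(address_.get_text(strip=True) if address_ is not None else '')

print(address_lists)  # ['東京都千代田区1-1-1'], plain strings, safe for pd.isnull
```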
Revision 2: Added supplementary information.

test (title): file without changes
test (body): CHANGED

@@ -251,3 +251,279 @@
         office_names.append(offices)
 
         time.sleep(10)
+
+## Addendum
+
+When I run the code, I get the error message below.
+
+The last line of the traceback says "TypeError: 'in <string>' requires string as left operand, not NoneType", so I suspect the problem is in one of the for loops.
+
+I'm asking because I don't know where, or how, to fix this. Please let me know.
+
+Thank you in advance.
+
+## Problem / error message
+
+Traceback (most recent call last):
+  File "d:/Pythonからspreadsheetsへ/venv1/lancerswork/scraping.py", line 111, in <module>
+    set_with_dataframe(sh, dict_df)
+  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\gspread_dataframe.py", line 241, in set_with_dataframe
+    _cellrepr(cell_value, allow_formulas))
+  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\gspread_dataframe.py", line 49, in _cellrepr
+    if pd.isnull(value) is True:
+  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\missing.py", line 126, in isna
+    return _isna(obj)
+  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\missing.py", line 141, in _isna_new
+    elif isinstance(
+  File "D:\Pythonからspreadsheetsへ\venv1\lib\site-packages\pandas\core\dtypes\generic.py", line 12, in _check
+    return getattr(inst, attr, "_typ") in comp
+TypeError: 'in <string>' requires string as left operand, not NoneType
+
+## Relevant source code
+
+```python
+import requests
+from bs4 import BeautifulSoup as bs
+from selenium import webdriver
+import time
+from tqdm import tqdm
+
+import json
+import gspread
+from googleapiclient import discovery
+from oauth2client.service_account import ServiceAccountCredentials
+from gspread_dataframe import get_as_dataframe
+from gspread_dataframe import set_with_dataframe
+import pandas as pd
+
+headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"}
+
+office_names = []
+address_lists = []
+profile_names = []
+office_names2 = []
+
+for page in range(1,3):
+    url = 'https://www.zeiri4.com/firm/search/?FirmSearchForm%5BPrefecture_id%5D=13&FirmSearchForm%5BIndustry_id%5D%5B0%5D=1&FirmSearchForm%5BIndustry_id%5D%5B1%5D=4&page={}'.format(page)
+    response = requests.get(url, headers=headers)
+    soup = bs(response.content, 'html.parser')
+
+    for office in soup.find_all('h2'):
+        offices = office.text.strip('\n')
+        office_names.append(offices)
+        time.sleep(10)
+
+    for address in soup.find_all('dl', class_='b-firmSearchPanel__datalist'):
+        address_ = address.find('dd')
+        address_lists.append(address_)
+        time.sleep(10)
+
+    url1 = 'https://www.bengo4.com/tokyo/f_12/?page={}'.format(page)
+    response1 = requests.get(url1, headers=headers)
+    soup1 = bs(response1.content, 'html.parser')
+
+    for name in soup1.find_all(class_='profile__name'):
+        name_ = name.text
+        profile_names.append(name_)
+        time.sleep(10)
+
+    for office2 in soup1.find_all('p', class_='office'):
+        office2_ = office2.text
+        office_names2.append(office2_)
+        time.sleep(10)
+
+browser = webdriver.Chrome(r'D:\Pythonからspreadsheetsへ\venv1\lancerswork\chromedriver.exe')
+browser.implicitly_wait(10)
+
+address_lists2 = []
+
+for page in range(1,3):
+    url2 = 'https://www.bengo4.com/tokyo/f_12/?page={}'.format(page)
+    browser.get(url2)
+
+    for address2 in browser.find_elements_by_class_name('address'):
+        address2_ = address2.text
+        address_lists2.append(address2_)
+        time.sleep(10)
+
+scope = ['https://spreadsheets.google.com/feeds','https://www.googleapis.com/auth/drive']
+credentials = ServiceAccountCredentials.from_json_keyfile_name('伏せておきます', scope)
+gc = gspread.authorize(credentials)
+SPREADSHEET_KEY = '1qPbI3Rf995Z53sU3sK60SlgMwtFkO13w4P-wBQlTcv8'
+worksheet = gc.open_by_key(SPREADSHEET_KEY)
+wb = worksheet.sheet1
+
+mydict = {
+    '事務所名':office_names,
+    '住所':address_lists,
+    '':'',
+    '弁護士名':profile_names,
+    '事務所名1':office_names2,
+    '住所1':address_lists2,
+}
+
+dict_df = pd.DataFrame({key:pd.Series(value) for key, value in mydict.items()})
+
+sh = gc.open_by_key('1qPbI3Rf995Z53sU3sK60SlgMwtFkO13w4P-wBQlTcv8').worksheet('シート1')
+
+set_with_dataframe(sh, dict_df)
+```
+
+## What I tried
+
+When I kept only the first scraping block below and commented out everything else, the script wrote to the spreadsheet successfully.
+
+for office in soup.find_all('h2'):
+    offices = office.text.strip('\n')
+    office_names.append(offices)
+    time.sleep(10)
+
+## Addendum
+
+When I run it with an arbitrary hand-made dictionary like the one below it works fine, but with the scraped data it raises the error.
+
+mydict={
+    'yoo':2,
+    'poo':3,
+    'foo':4,
+    'too':5
+}
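For what it's worth, the traceback itself points at the culprit: `_check` falls back to `getattr(inst, "_typ", ...)`, and bs4 `Tag` objects intercept unknown attribute lookups. A minimal sketch reproducing the probe follows; the `pd.isnull` failure itself depends on the older pandas version shown in the traceback, and newer releases may tolerate it:

```python
import pandas as pd
from bs4 import BeautifulSoup

tag = BeautifulSoup('<dd>東京都千代田区1-1-1</dd>', 'html.parser').dd

# A Tag treats any unknown attribute as "find a child tag with that name",
# so the getattr default never kicks in and None comes back instead:
print(getattr(tag, '_typ', 'fallback'))  # prints None, not 'fallback'

# On the pandas version from the traceback, that None flows into
# `None in comp` inside _check and raises the reported TypeError.
try:
    print(pd.isnull(tag))
except TypeError as exc:
    print(exc)  # 'in <string>' requires string as left operand, not NoneType
```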
Revision 1: Added supplementary information.

test (title): file without changes
test (body): CHANGED

@@ -235,3 +235,19 @@
 set_with_dataframe(sh, dict_df)
 
 ```
+
+## What I tried
+
+When I kept only the first scraping block below and commented out everything else, the script wrote to the spreadsheet successfully.
+
+for office in soup.find_all('h2'):
+    offices = office.text.strip('\n')
+    office_names.append(offices)
+    time.sleep(10)