Background / What I want to achieve
I want to scrape the Tabelog site (https://tabelog.com/).
While implementing this, the following error message occurred.
Traceback (most recent call last):
  File "C:/Users/90000/Desktop/chromedriver_win32/tabelog.py", line 144, in <module>
    df['area'] = elems_str_area_list
  File "C:\Users\90000\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py", line 3607, in __setitem__
    self._set_item(key, value)
  File "C:\Users\90000\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py", line 3779, in _set_item
    value = self._sanitize_column(value)
  File "C:\Users\90000\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\frame.py", line 4504, in _sanitize_column
    com.require_length_match(value, self.index)
  File "C:\Users\90000\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\common.py", line 531, in require_length_match
    raise ValueError(
ValueError: Length of values (0) does not match length of index (20)
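As far as I understand, pandas raises this ValueError whenever a list assigned as a new column has a different length from the DataFrame's index. A minimal sketch with made-up data (not from my script) reproduces the same message:

import pandas as pd

df = pd.DataFrame({'genre': ['a'] * 20})  # a frame that already has 20 rows
df['area'] = []                           # assigning an empty list to it
# -> ValueError: Length of values (0) does not match length of index (20)

So it looks like elems_str_area_list ends up empty even though the other lists have 20 entries.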
Relevant source code
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import requests
from selenium import webdriver

browser = webdriver.Chrome()

def get_info_str(url):
    browser.get(url)
    elems_str_info = browser.find_elements_by_class_name('rstinfo-table')
    elem_str_info = elems_str_info[0].text.split('\n')
    elem_df = pd.DataFrame()
    elem_df['elem'] = elem_str_info
    elem_df['number'] = list(range(0, len(elem_df)))

    # Get the address
    try:
        elem_str_address = browser.find_elements_by_class_name('rstinfo-table__address')
        elem_str_address = elem_str_address[0].text
    except:
        elem_str_address = "-"

    # Get the number of seats
    try:
        chair_count = [x for x in elem_str_info if x.endswith('席')]
        chair_count = chair_count[0]
    except:
        chair_count = "-"

    # Get the regular closing days
    try:
        i = elem_df.query('elem == "定休日"')['number'].max() + 1
        regular_holiday = list(elem_df.query('number == {}'.format(i))['elem'])[0]
    except:
        regular_holiday = "-"

    # Get the opening date
    try:
        i = elem_df.query('elem == "オープン日"')['number'].max() + 1
        open_date = list(elem_df.query('number == {}'.format(i))['elem'])[0]
    except:
        open_date = "-"

    # Get the phone number
    try:
        i = elem_df.query('elem == "電話番号"')['number'].max() + 1
        call_number = list(elem_df.query('number == {}'.format(i))['elem'])[0]
    except:
        call_number = "-"

    str_info = list()
    str_info.append(elem_str_address)
    str_info.append(chair_count)
    str_info.append(regular_holiday)
    str_info.append(open_date)
    str_info.append(call_number)
    return str_info

browser.get('https://tabelog.com/tokyo/')
elems_tabelog = browser.find_elements_by_class_name('list-rst__header')

str_name_list = list()
for x in elems_tabelog:
    elem_str = x.text.split()
    print(elem_str)

# Get the genre
elems_genre_list = list()
for x in elems_tabelog:
    elem_genre = x.text.split('/')[1]
    elems_genre_list.append(elem_genre)
print(elems_genre_list)

# Get the restaurant name
elems_str_name_list = list()
for x in elems_tabelog:
    elem_str_name = x.text.split('/')[0]
    elem_str_name = elem_str_name.split()
    #elem_str_name = elem_str_name[0:len(elem_str_name)-1]
    elem_str_name = elem_str_name[0:(len(elem_str_name) - 2)]
    elem_str_name = ' '.join(elem_str_name)
    elems_str_name_list.append(elem_str_name)

# Get the rating
elems_tabelog_score = browser.find_elements_by_class_name("list-rst__rate")
tabelog_score_list = list()
for x in elems_tabelog_score:
    tabelog_score = x.text.split()[0]
    tabelog_score_list.append(tabelog_score)

# Get the area
elems_str_area_list = list()
for x in elems_tabelog:
    elem_str_area = x.text.split('/')[0]
    elem_str_area = elem_str_area.split()[-2:]
    elem_str_area = elem_str_area[0] + elem_str_area[1]
    #elem_str_name = elem_str_name.split('')
    #elem_str_name = elem_str_name[0:len(elem_str_name)-1]
    elems_str_area_list.append(elem_str_area)

# Get the lunch and dinner budgets
elems_tabelog_plan = browser.find_elements_by_class_name("list-rst__budget")

### Lunch
elem_tabelog_lunch_list = list()
for x in elems_tabelog_plan:
    tabelog_lunch = x.text.split()[3]
    elem_tabelog_lunch_list.append(tabelog_lunch)

### Dinner
elem_tabelog_dinner_list = list()
for x in elems_tabelog_plan:
    tabelog_dinner = x.text.split()[3]  # note: same token index as lunch
    elem_tabelog_dinner_list.append(tabelog_dinner)

# Get each restaurant's page URL
r = requests.get('https://tabelog.com/tokyo/')  # fetch the listing page with requests
soup = BeautifulSoup(r.text, 'lxml')  # parse the elements
page_list = list()
for a in soup.find_all(class_='list-rst__rst-name-target cpy-rst-name'):
    page_list.append(a.get('href'))

df = pd.DataFrame()
df['genre'] = elems_genre_list
df['str_name'] = elems_str_name_list
df['str_score'] = tabelog_score_list
df['area'] = elems_str_area_list
df['lunch_plan'] = elem_tabelog_lunch_list
df['dinner_plan'] = elem_tabelog_dinner_list
df['str_url'] = page_list
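The assignment that fails is df['area'] = elems_str_area_list near the end of the block above. A hypothetical debugging snippet (reusing the variable names from my script) could check which list comes back empty, placed just before df = pd.DataFrame():

# Hypothetical length check: every length must match for the assignments to succeed
for name, lst in [('genre', elems_genre_list),
                  ('str_name', elems_str_name_list),
                  ('str_score', tabelog_score_list),
                  ('area', elems_str_area_list),
                  ('lunch_plan', elem_tabelog_lunch_list),
                  ('dinner_plan', elem_tabelog_dinner_list),
                  ('str_url', page_list)]:
    print(name, len(lst))

The rest of the script, which collects per-restaurant details and merges them in, is below: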
str_list = list()
for i in range(len(df)):
    page_url = df['str_url'][i]
    print(page_url)
    list_x = get_info_str(page_url)
    list_x.append(page_url)
    str_list.append(list_x)

str_df = pd.DataFrame(str_list)
str_df.columns = ['str_address', 'chair_count', 'holiday', 'open_date', 'callnumber', 'str_url']
str_1 = pd.merge(df, str_df, how='left')
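One note on the final line: since pd.merge is called without an on= argument, pandas joins on every column name the two frames share, which here should only be str_url. If I understand correctly, it is equivalent to spelling the key out:

# Equivalent merge with the join key made explicit:
str_1 = pd.merge(df, str_df, how='left', on='str_url')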
Supplementary information (framework/tool versions, etc.)
Python 3.9 on Windows (per the paths in the traceback). I am a Python beginner; any guidance is appreciated.