#目的
ValueError: Length of values does not match length of index のエラーを解決したい。
背景
とある求人サイトをスクレイピングし、pandasをつかってCSVファイルに格納する過程で、取得項目が不規則でCSV格納するとlengthが合わなくなりエラーになる。
該当コード
from selenium import webdriver from time import sleep import pandas as pd import re browser = webdriver.Chrome() browser.get('https://re-katsu.jp/career/search/kanto/') st_urls = [] for page in range( 1,36): url = 'https://re-katsu.jp/career/search/kanto/?pagCnt={}'.format(page) browser.get(url) sleep(1) browser.find_element_by_css_selector details = browser.find_elements_by_class_name('headers') for detail in details: _url = detail.find_element_by_tag_name("h3 a") _url = _url.get_attribute("href") list_urls.append(_url) industry_mix = [] up_mix = [] c_mix = [] occ_mix = [] f_salary_mix = [] area_mix = [] staff_mix = [] c_address_mix = [] hp_mix = [] info_mix = [] for url in list_urls: url = browser.get(url) sleep(1) #会社名 c_names = browser.find_element_by_class_name("headers").find_elements_by_tag_name("h2") for c_name in c_names: _mix = c_name.text c_mix.append(_mix) #掲載終了予定日 updates = browser.find_element_by_class_name("article-data").find_elements_by_id('ctl00_ContentPlaceHolder1_lblPublishedLastdate') for update in updates: _mix = update.text up_mix.append(_mix) #業種 industrys = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblIndustryIcon') for industry in industrys: _mix = industry.text industry_mix.append(_mix) #職種 occs = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblServIcon') for occ in occs: _mix = occ.text occ_mix.append(_mix) #初年度給与 f_salarys = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblIncomeIcon') for f_salary in f_salarys: _mix = f_salary.text f_salary_mix.append(_mix) #勤務地 areas = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblWorkLocateIcon') for area in areas: _mix = area.text area_mix.append(_mix) #従業員数 staffs = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_trEmployeesCount') if not len(staffs) == len(c_names): for staff in staffs: _mix = staff.text staff_mix.append(_mix) #本社所在地 c_addresses = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblHeadofficelocation') for c_address in c_addresses: _mix = c_address.text c_address_mix.append(_mix) #HP hps = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblLink_Body1') for hp in hps: _mix = hp.text hp_mix.append(_mix) #連絡先 pattern = r'[(]{0,1}[0-9]{2,4}[)\-(]{0,1}[0-9]{2,4}[)\-]{0,1}[0-9]{3,4}' infos = browser.find_elements_by_id('ctl00_ContentPlaceHolder1_lblContactInfo') for info in infos: _mix = info.text #regex = re.compile(pattern, flags=0) #mo = regex.search(_mix) #mo.group() info_mix.append(_mix) df = pd.DataFrame() df['会社名'] = c_mix df['掲載終了予定日'] = up_mix df['業種'] = industry_mix df['職種'] = occ_mix df['初年度年収'] = f_salary_mix df['勤務地'] = area_mix df['従業員数'] = staff_mix df['本社所在地'] = c_address_mix df['HP'] = hp_mix df['連絡先'] = info_mix
エラーコード
ValueError Traceback (most recent call last) <ipython-input-363-7e5d40ae8e25> in <module> 5 df['初年度年収'] = f_salary_mix 6 df['勤務地'] = area_mix ----> 7 df['従業員数'] = staff_mix 8 df['本社所在地'] = c_address_mix 9 df['HP'] = hp_mix ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __setitem__(self, key, value) 3470 else: 3471 # set column -> 3472 self._set_item(key, value) 3473 3474 def _setitem_slice(self, key, value): ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _set_item(self, key, value) 3547 3548 self._ensure_valid_index(value) -> 3549 value = self._sanitize_column(key, value) 3550 NDFrame._set_item(self, key, value) 3551 ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast) 3732 3733 # turn me into an ndarray -> 3734 value = sanitize_index(value, self.index, copy=False) 3735 if not isinstance(value, (np.ndarray, Index)): 3736 if isinstance(value, list) and len(value) > 0: ~/opt/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_index(data, index, copy) 610 611 if len(data) != len(index): --> 612 raise ValueError("Length of values does not match length of index") 613 614 if isinstance(data, ABCIndexClass) and not copy: ValueError: Length of values does not match length of index
初心者のため、ご教示お願いいたします!!
バッドをするには、ログインかつ
こちらの条件を満たす必要があります。
2020/04/17 02:02
退会済みユーザー
2020/04/17 03:22
退会済みユーザー
2020/04/17 03:27