前提
Python でスクレイピングを実装中にエラーが発生しました。
じゃらんのランキングから口コミを取得しようとしています。
こちらのコードをそのまま参考にして実行しようとしています。
https://www.youtube.com/watch?v=TVjj2YH6_5Q
https://colab.research.google.com/drive/116e8gh8_kyjFStcxalUi2JYMS-ub0C3M?usp=sharing#scrollTo=qz8Khqwcz-Ub
下記の該当のソースコードまで実行したところ、エラーが発生しました。
どの部分がいけないのかも分からず、教えていただけると嬉しいです。
発生している問題・エラーメッセージ
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Input In [4], in <cell line: 10>()
      6 output_path = f'./data/jalan_{area}_{stay_type}_{rank_type}_{max_num}.csv'
      8 jsc = JalanReviewScraping(area, stay_type, rank_type,
      9 verbose=True, max_num=max_num)
---> 10 df = jsc.scaping(output_path)

Input In [3], in JalanReviewScraping.scaping(self, output_path)
    291 driver = webdriver.Chrome('chromedriver',options=self.options)
    292 driver.implicitly_wait(10)
--> 293 df = self.sample_kuchikomi(driver, self.rank_type, self.area, self.stay_type)
    294 driver.close()
    295 if output_path:

Input In [3], in JalanReviewScraping.sample_kuchikomi(self, driver, rank_type, area, stay_type)
     95 kuchikomi['地域'] = area
     96 kuchikomi['宿タイプ'] = stay_type
---> 98 kuchikomi = self.get_kuchikomi_oldest(_last_itr, kuchikomi, count, kuchikomi_elm)
     99 if not kuchikomi:
    100 continue

Input In [3], in JalanReviewScraping.get_kuchikomi_oldest(self, _last_itr, kuchikomi, count, kuchikomi_elm)
    217 age = None
    218 situation = None
--> 220 sex = re.sub(' ', '', sex)
    221 kuchikomi['性別'] = sex
    222 kuchikomi['年代'] = age

File ~/opt/anaconda3/envs/resort_1/lib/python3.10/re.py:209, in sub(pattern, repl, string, count, flags)
    202 def sub(pattern, repl, string, count=0, flags=0):
    203 """Return the string obtained by replacing the leftmost
    204 non-overlapping occurrences of the pattern in string by the
    205 replacement repl. repl can be either a string or a callable;
    206 if a string, backslash escapes in it are processed. If it is
    207 a callable, it's passed the Match object and must return
    208 a replacement string to be used."""
--> 209 return _compile(pattern, flags).sub(repl, string, count)

TypeError: expected string or bytes-like object
該当のソースコード
# Scraping target: region, lodging type and ranking category.
area = '関東・甲信越'
stay_type = 'ペンション・民宿・ロッジ'
rank_type = '泊まって良かった宿'
max_num = 200  # maximum number of reviews to fetch per lodging
output_path = f'./data/jalan_{area}_{stay_type}_{rank_type}_{max_num}.csv'

# Build the scraper and run it; the path is handed to scaping().
jsc = JalanReviewScraping(area, stay_type, rank_type,
                          verbose=True, max_num=max_num)
df = jsc.scaping(output_path)
output_pathの中身は何になっていますか。
前にこちらのクラスを定義しています
# スクレイピング用のコード
class JalanConfig:
    """Static lookup tables used when navigating the Jalan ranking pages.

    The tables are class attributes and are read directly off the class
    (e.g. ``JalanConfig.area_num[i]``).
    """

    # Position of a region link on the ranking page -> region name.
    # Used as a fallback label when a link element has no visible text.
    area_num = {
        0: '北海道',
        1: '東北',
        2: '関東・甲信越',
        3: '東海',
        4: '近畿・北陸',
        5: '中国・四国',
        6: '九州',
        7: '沖縄',
    }

    # Lodging type -> keyword interpolated into the ``#<keyword>`` selector
    # that locates the ranked items for that lodging type.
    stay_type_config = {
        '旅館': 'total1',
        'ホテル': 'total2',
        'ペンション・民宿・ロッジ': 'total3',
    }

    # Ranking category -> keyword interpolated into the
    # ``rankingsContainer.<keyword>Container`` CSS class selector.
    rank_type_config = {
        '泊まって良かった宿': 'first',
        'お風呂の良かった宿': 'second',
        '夕食の良かった宿': 'third',
        'お部屋の良かった宿': 'last',
    }
class JalanReviewScraping:
"""
じゃらんの温泉口コミをスクレイピングするクラス
"""
def __init__(self, area:str, stay_type:str, rank_type:str,
max_num:int, verbose:bool=False)->None:
self.hp = 'https://www.jalan.net/jalan/doc/ranking'
self.driver_path = 'chromedriver'
self.area = area
self.stay_type = stay_type
self.rank_type = rank_type
self.max_num = max_num
self.verose = verbose
def sample_kuchikomi(self, driver, rank_type, area, stay_type):
"""
Collect reviews for every lodging in the selected ranking.

Opens the ranking top page (self.hp), follows the link for ``area``, then
visits each ranked lodging of ``stay_type`` and gathers one dict per review.
The collected dicts are returned as a pandas DataFrame.
"""
# NOTE(review): indentation was lost in this listing; the nesting of the
# statements below has to be restored before the code can run.
driver.get(self.hp)
print("start: ", driver.title)
# Links for each region under the requested ranking type
_rank_kw = JalanConfig.rank_type_config[rank_type]
elems = driver.find_elements(by=By.CSS_SELECTOR,
value=f'#rank_top > div > div > div.rankingsContainer.{_rank_kw}Container > ul > li > a')
_area_url = {}
for i, elem in enumerate(elems):
_area = elem.text
# Fall back to the positional region name when the link has no text
if not _area:
_area = JalanConfig.area_num[i]
_url = elem.get_attribute('href')
_area_url[_area] = _url
# Go to the page for the requested region
driver.get(_area_url[area])
kushikomi_list = []
# Crawl the ranked lodgings (1st-10th) in order, collecting up to max_num reviews each
_stay_type_kw = JalanConfig.stay_type_config[stay_type]
items = driver.find_elements(by=By.CSS_SELECTOR, value=f'#{_stay_type_kw} > div.item')
for i, item in enumerate(items, 1):
# Lodging name
name = item.find_element(by=By.CLASS_NAME, value='title').text.replace('\u3000', ' ')
rank = i
print(rank_type, ' ', area, f' ランキング{i}位', name)
# Get the URL of this lodging's page
try:
hotel_url = item.find_elements(by=By.CSS_SELECTOR, value='div>ul>li>a')[1].get_attribute('href')
# Bare except: any error while reading the second link skips this lodging
except:
continue
# NOTE(review): a new Chrome instance is started here and no close()/quit()
# on driver_hotel is visible in this method - confirm it is released.
# NOTE(review): self.options is not assigned in the __init__ shown in this
# file - confirm where it is set.
driver_hotel = webdriver.Chrome('chromedriver',options=self.options)
driver_hotel.implicitly_wait(10)
driver_hotel.get(hotel_url)
kuchikomi_all = driver_hotel.find_elements(by=By.CSS_SELECTOR, value='div.jlnpc-kuchikomiCassette__contWrap')
# _last_itr = min(number of reviews found on the page, max_num)
if len(kuchikomi_all) < self.max_num:
_last_itr = len(kuchikomi_all)
else:
_last_itr = self.max_num
# Collect the newest reviews
for count, kuchikomi_elm in enumerate(kuchikomi_all[:self.max_num], 1):
kuchikomi = {}
kuchikomi['宿名'] = name
kuchikomi['順位'] = rank
kuchikomi['ランキング種別'] = rank_type
kuchikomi['地域'] = area
kuchikomi['宿タイプ'] = stay_type
kuchikomi = self.get_kuchikomi_oldest(_last_itr, kuchikomi, count, kuchikomi_elm)
# A falsy result means the review could not be parsed; skip it
if not kuchikomi:
continue
kushikomi_list.append(kuchikomi)
# Newest reviews fit on a single page
if not driver_hotel.find_elements(by=By.CLASS_NAME, value='page'):
# How far back into past reviews we still need to go
past_kuchikomi_num = self.max_num - _last_itr
# 30 is presumably the number of reviews per page - TODO confirm
page_num = past_kuchikomi_num//30 + 1
if past_kuchikomi_num%30 == 0:
page_num -= 1
# Newest reviews span multiple pages
else:
counter = 1
# Keep following the "next" link for as long as one exists
while driver_hotel.find_elements(by=By.CSS_SELECTOR, value='#kuchikomiArea > nav > a.next'):
_js_elem = driver_hotel.find_element(by=By.CSS_SELECTOR, value='#kuchikomiArea > nav > a.next')
js = _js_elem.get_attribute('onclick')
driver_hotel.execute_script(js)
kuchikomi_all = driver_hotel.find_elements(by=By.CSS_SELECTOR, value='div.jlnpc-kuchikomiCassette__contWrap')
if len(kuchikomi_all)+counter*30 < self.max_num:
_last_itr = len(kuchikomi_all)
else:
_last_itr = self.max_num
for count, kuchikomi_elm in enumerate(kuchikomi_all[:self.max_num], 1):
kuchikomi = {}
kuchikomi['宿名'] = name
kuchikomi['順位'] = rank
kuchikomi['ランキング種別'] = rank_type
kuchikomi['地域'] = area
kuchikomi['宿タイプ'] = stay_type
kuchikomi = self.get_kuchikomi_oldest(_last_itr, kuchikomi, count, kuchikomi_elm)
if not kuchikomi:
continue
# NOTE(review): unlike the first-page loop above, no
# kushikomi_list.append(kuchikomi) follows here - confirm whether reviews
# from later pages are meant to be dropped.
counter += 1
# How far back into past reviews we still need to go
past_kuchikomi_num = self.max_num - _last_itr - (counter-1)*30
page_num = past_kuchikomi_num//30 + 1
if past_kuchikomi_num%30 == 0:
page_num -= 1
# Nothing left to fetch from the past-review log for this lodging
if past_kuchikomi_num <= 0:
continue
# NOTE(review): a second Chrome instance per lodging; no close()/quit() on
# driver_past is visible in this method - confirm it is released.
driver_past = webdriver.Chrome('chromedriver',options=self.options)
driver_past.implicitly_wait(10)
driver_past.get(hotel_url)
# When reviews span multiple pages, jump to the last page first
if driver_past.find_elements(by=By.CLASS_NAME, value='page'):
_js_elem = driver_past.find_element(by=By.CSS_SELECTOR, value='#kuchikomiArea > nav > a.last')
js = _js_elem.get_attribute('onclick')
driver_past.execute_script(js)
# Skip the lodging when no past-review link is present
if not driver_past.find_elements(
by=By.CLASS_NAME, value='jlnpc-kuchikomi__pastLink'):
continue
elif not driver_past.find_elements(
by=By.CLASS_NAME, value='jlnpc-kuchikomi__pastLink'):
continue
# Collect past reviews
# Extract the lodging ID needed by the JS call from the URL
_hotel_url = Path(hotel_url)
_yad_id = re.sub('[a-z]+', '', _hotel_url.parent.name)
page_count = 1
driver_past.execute_script(f"javascript:openPastKuchikomiList('',{_yad_id})")
print('------過去ログ取得開始------')
while page_count <= page_num:
kuchikomi_past_all = driver_past.find_elements(by=By.CLASS_NAME,
value='user-kuchikomi')
# On the final page keep only the remainder that is still needed
# NOTE(review): when past_kuchikomi_num is an exact multiple of 30 the
# slice below is [:0] - confirm this is intended.
if page_count==page_num:
_last_num = past_kuchikomi_num%30
kuchikomi_past_all = kuchikomi_past_all[:_last_num]
print('口コミ取得中############', f'{page_count}/{page_num} ページ')
for past_kuchikomi in kuchikomi_past_all:
kuchikomi = {}
kuchikomi['宿名'] = name
kuchikomi['順位'] = rank
kuchikomi['ランキング種別'] = rank_type
kuchikomi['地域'] = area
kuchikomi['宿タイプ'] = stay_type
kuchikomi = self.get_kuchikomi_past(kuchikomi, past_kuchikomi)
if not kuchikomi:
continue
kushikomi_list.append(kuchikomi)
page_count += 1
driver_past.execute_script(f"javascript:nextPage('{page_count}', '{_yad_id}');")
# Merge into a DataFrame
df_kuchikomi = pd.DataFrame(kushikomi_list)
return df_kuchikomi
def get_kuchikomi_oldest(self, _last_itr, kuchikomi, count, kuchikomi_elm):
if self.verose:
print('口コミ取得中 ########### ', f'{count}/{_last_itr}')
# 口コミの取得
kuchikomi['投稿日'] = kuchikomi_elm.find_element(
by=By.CLASS_NAME,
value='jlnpc-kuchikomiCassette__postDate').text.split(':')[1]
try:
sex, _info = kuchikomi_elm.find_element(by=By.CLASS_NAME,
value='jlnpc-kuchikomiCassette__leftArea__contHead').text.split('/')
_, age, situation = _info.split(' ')
except:
sex = kuchikomi_elm.find_element(by=By.CLASS_NAME,
value='jlnpc-kuchikomiCassette__leftArea__contHead').text.split('/')
_info = None
age = None
situation = None
sex = re.sub(' ', '', sex)
kuchikomi['性別'] = sex
kuchikomi['年代'] = age
kuchikomi['シチュエーション'] = situation
price = kuchikomi_elm.find_elements(by=By.CLASS_NAME,
value='jlnpc-kuchikomiCassette__planInfoList')[2].text
kuchikomi['価格帯'] = price.split(' ')[1]
_points = kuchikomi_elm.find_element(by=By.CLASS_NAME,
value='jlnpc-kuchikomiCassette__rateList').text.split('\n')
# ---------- 評価点は過去口コミにないので除外する ---------- #
category = _points[::2]
points = _points[1::2]
for n in range(len(category)):
kuchikomi[category[n]] = int(points[n].replace('-', '0'))
try:
title = kuchikomi_elm.find_element(by=By.CLASS_NAME, value='jlnpc-kuchikomiCassette__lead').text
review = kuchikomi_elm.find_element(by=By.CLASS_NAME, value='jlnpc-kuchikomiCassette__postBody').text
except:
return None
kuchikomi['タイトル'] = title
ku
回答1件
あなたの回答
tips
プレビュー