Python Webスクレイピングにおけるscheduleモジュール実行時のエラー

Question

### 前提・実現したいこと Pythonを活用したWebスクレイピング処理に関して、 scheduleモジュールを活用した定時処理を行いたいと考えています。 Webスクレイピング用の関数単体では問題なく動作するのですが、 scheduleモジュールで処理した際にエラーが出てしまいます。関数の中にtry、exceptを組み込んでいる影響でしょうか。 ※if elseに書き換えても同様のメッセージが発生しましたので更新したコードを記載いたします。 ### 発生している問題・エラーメッセージ --------------------------------------------------------------------------- AttributeError Traceback (most recent call last) in () 2 3 while True: ----> 4 schedule.run_pending() 5 time.sleep(10) 4 frames /usr/local/lib/python3.7/dist-packages/schedule/__init__.py in run_pending() 590 :data:`default scheduler instance `. 591 """ --> 592 default_scheduler.run_pending() 593 594 /usr/local/lib/python3.7/dist-packages/schedule/__init__.py in run_pending(self) 92 runnable_jobs = (job for job in self.jobs if job.should_run) 93 for job in sorted(runnable_jobs): ---> 94 self._run_job(job) 95 96 def run_all(self, delay_seconds=0): /usr/local/lib/python3.7/dist-packages/schedule/__init__.py in _run_job(self, job) 145 146 def _run_job(self, job): --> 147 ret = job.run() 148 if isinstance(ret, CancelJob) or ret is CancelJob: 149 self.cancel_job(job) /usr/local/lib/python3.7/dist-packages/schedule/__init__.py in run(self) 489 """ 490 logger.debug('Running job %s', self) --> 491 ret = self.job_func() 492 self.last_run = datetime.datetime.now() 493 self._schedule_next_run() in scrayping() 13 14 page_1_b = soup1.find("div", {"class":"main-inner-b"}) ---> 15 title_1_b = page_1_b.find("h3").text.replace("\u3000","").replace(" ","") 16 overview_1_b = page_1_b.find("p").text.replace("\u3000","").replace(" ","") 17 link_1_b = page_1_b.find("a").get("href") AttributeError: 'NoneType' object has no attribute 'find' ### 該当のソースコード全文記載させていただきます。・関数の定義・キーワードのインプット受け付け・スケジュールモジュールで定時処理といった構成です。長文失礼いたします。 ```python def scrayping(): #page_1 result1 = requests.get(url1) c1 = result1.content soup1 = BeautifulSoup(c1) if soup1.find("div", {"class":"main-inner-a"}) is None: title_1_a = [] overview_1_a = [] link_1_a = [] else: page_1_a = soup1.find("div", 
{"class":"main-inner-a"}) title_1_a = page_1_a.find("h3").text.replace("\u3000","").replace(" ","") overview_1_a = page_1_a.find("p").text.replace("\u3000","").replace(" ","") link_1_a = page_1_a.find("a").get("href") if soup1.find("div", {"class":"main-inner-b"}) is None: title_1_b = [] overview_1_b = [] link_1_b = [] else: page_1_b = soup1.find("div", {"class":"main-inner-b"}) title_1_b = page_1_b.find("h3").text.replace("\u3000","").replace(" ","") overview_1_b = page_1_b.find("p").text.replace("\u3000","").replace(" ","") link_1_b = page_1_b.find("a").get("href") title_1_c = [] overview_1_c = [] link_1_c = [] try: #cが8個ない場合に対応 for i in range(8): page_1_c = soup1.find_all("div", {"class":"main-inner-c"})[i] tmp_title = page_1_c.find("h3").text.replace("\u3000","").replace(" ","") title_1_c.append(tmp_title) tmp_overview = page_1_c.find_all("p")[2].text.replace("\u3000","").replace(" ","") overview_1_c.append(tmp_overview) tmp_link = page_1_c.find("a").get("href") link_1_c.append(tmp_link) i += 1 except: pass title_1_all = pd.DataFrame([title_1_a, title_1_b] + title_1_c) overview_1_all = pd.DataFrame([overview_1_a, overview_1_b] + overview_1_c) link_1_all = pd.DataFrame([link_1_a, link_1_b] + link_1_c) page1_df = pd.concat([title_1_all, overview_1_all, link_1_all], axis=1) #page_2 result2 = requests.get(url2) c2 = result2.content soup2 = BeautifulSoup(c2) title_2_c = [] overview_2_c = [] link_2_c = [] try: #cが8個ない場合に対応 for i in range(10): page_2_c = soup2.find_all("div", {"class":"main-inner-c"})[i] tmp_title = page_2_c.find("h3").text.replace("\u3000","").replace(" ","") title_2_c.append(tmp_title) tmp_overview = page_2_c.find_all("p")[2].text.replace("\u3000","").replace(" ","") overview_2_c.append(tmp_overview) tmp_link = page_2_c.find("a").get("href") link_2_c.append(tmp_link) i += 1 except: pass title_2_all = pd.DataFrame(title_2_c) overview_2_all = pd.DataFrame(overview_2_c) link_2_all = pd.DataFrame(link_2_c) page2_df = pd.concat([title_2_all, 
overview_2_all, link_2_all], axis=1) #page_3 result3 = requests.get(url3) c3 = result3.content soup3 = BeautifulSoup(c3) title_3_c = [] overview_3_c = [] link_3_c = [] try: #cが8個ない場合に対応 for i in range(10): page_3_c = soup3.find_all("div", {"class":"main-inner-c"})[i] tmp_title = page_3_c.find("h3").text.replace("\u3000","").replace(" ","") title_3_c.append(tmp_title) tmp_overview = page_3_c.find_all("p")[2].text.replace("\u3000","").replace(" ","") overview_3_c.append(tmp_overview) tmp_link = page_3_c.find("a").get("href") link_3_c.append(tmp_link) i += 1 except: pass title_3_all = pd.DataFrame(title_3_c) overview_3_all = pd.DataFrame(overview_3_c) link_3_all = pd.DataFrame(link_3_c) page3_df = pd.concat([title_3_all, overview_3_all, link_3_all], axis=1) #total page_df = pd.concat([page1_df, page2_df, page3_df], axis=0,ignore_index=True) page_df.columns=["Title","Overview","Link"] page_df.index = np.arange(1, len(page_df)+1) page_df.to_excel("result.xlsx") today = datetime.datetime.now() today = str(today.year) + "_" + str(today.month) + "_" + str(today.day) + "_" + str(today.hour) path1 = 'result.xlsx' path2 = today + "_" + keyword + "_" + period + "_" +'result.xlsx' os.rename(path1, path2) #キーワード受け付け keyword = input("検索ワードを入力してください。") #Free period = input("monthかweekかdayを入力してください。") #[month, week, day] url1 = "https://runda.jp/search/?q=" + keyword + "&page=1&search_type=" + period url2 = "https://runda.jp/search/?q=" + keyword + "&page=2&search_type=" + period url3 = "https://runda.jp/search/?q=" + keyword + "&page=3&search_type=" + period #繰り返し自動処理 schedule.every(1).day.at("10:30").do(scrayping) while True: schedule.run_pending() time.sleep(10) ``` ### 試したこと関数の定義、関数単体での実行は問題なく作動したのですが scheduleで処理した際に上手くいかない状態です。

Accepted Answer

エラーメッセージを見る限り、
page_1_bがNoneであるのに、page_1_b.findメソッドを実行しようとしてエラーが起きた。
となっていますので、tryが拾っていないようです。

この部分をtryではなく、if page_1_b != None:に変更して再現するかどうかを調べてみてはいかがでしょうか。

```python
  page_1_b = soup1.find("div", {"class":"main-inner-b"})
  if page_1_b != None: #bがある場合
    title_1_b = page_1_b.find("h3").text.replace("\u3000","").replace("\n","")
    overview_1_b = page_1_b.find("p").text.replace("\u3000","").replace("\n","")
    link_1_b = page_1_b.find("a").get("href")
  else:
    title_1_b = []
    overview_1_b = []
    link_1_b = []
```

前提・実現したいこと

発生している問題・エラーメッセージ

該当のソースコード

試したこと

関連した質問