PythonのスクレイピングでSSLエラー

## 起こっていること

Pythonのrequestsを使ってグーグルの検索結果のタイトルやHタグを抜き出すスクレイピングを試みています。ソースコードは以下です。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import requests
from bs4 import BeautifulSoup
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import time


# Page opened first by main(), and the text main() expects to find in that page's title
URL = 'https://www.google.co.jp'
URL_TITLE = 'Google'

# Both API scopes must be listed; otherwise a refresh token has to be reissued every 3600 seconds
scope = [
    'https://spreadsheets.google.com/feeds',
    'https://www.googleapis.com/auth/drive'
]

# Credential setup
# Load the downloaded JSON key file into the credentials variable
# (it is a private key; keep it somewhere this Python file can read it easily)
credentials = ServiceAccountCredentials.from_json_keyfile_name(
    'google-multi-search-〇〇〇〇〇.json', scope)

# Key of the spreadsheet that has been shared with the service account
SPREADSHEET_KEY = '〇〇〇〇〇〇'


def main():
    '''
    Main routine.

    Reads the search keywords from keyword.txt (one per line), searches
    Google for each of them with a headless Chrome, collects the first-page
    results with get_info() and writes them to Google Sheets with
    googlespreadsheets().
    '''

    # Blank lines are skipped so that an empty keyword is never searched for
    with open('keyword.txt', encoding='UTF-8') as f:
        keywords = [line.rstrip() for line in f if line.strip()]

    options = Options()
    options.add_argument('--headless')  # run Chrome without a visible window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(URL)  # open the Google top page
        time.sleep(2)  # give the page time to load
        assert URL_TITLE in driver.title  # sanity check: we really are on Google

        for keyword in keywords:
            print('検索キーワード：' + keyword)

            # Run the Google search
            search(driver, keyword)

            # Collect the result information
            items = get_info(driver, keyword)

            # Write to Google Sheets; googlespreadsheets() returns 3 when it
            # is finished (or has given up) and a smaller count to ask for a retry
            count = 0
            while count != 3:
                count = googlespreadsheets(items, keyword, count)
    finally:
        # Always close the browser, even when a step above raised, so that no
        # headless Chrome process is left running
        driver.quit()


def search(driver, keyword):
    '''
    Type the keyword into the search text box and submit the search.

    driver: Selenium WebDriver currently showing a Google page.
    keyword: text to search for.
    '''

    # Locate the search box by its name attribute. The generic find_element()
    # is used with the locator strategy string 'name' (the value of By.NAME)
    # because the find_element_by_* shortcuts were removed in Selenium 4.3.
    input_element = driver.find_element('name', 'q')
    input_element.clear()  # remove any text left from the previous search
    input_element.send_keys(keyword)
    input_element.send_keys(Keys.RETURN)  # press Enter to submit
    time.sleep(2)  # wait for the result page to load


def get_info(driver, keyword, timeout=10):
    '''
    Collect information about the search results currently shown by driver.

    driver: Selenium WebDriver showing a Google result page.
    keyword: the keyword that was searched for; stored under 'keyword'.
    timeout: seconds to wait for each result page before giving up on it.

    Returns a dict with the keys 'keyword', 'title', 'url', 'description'
    and 'h1' to 'h6'. 'title', 'url' and 'description' are lists of strings.
    Each of 'h1' to 'h6' holds one list of heading texts per entry in 'url',
    in the same order; a page that could not be fetched contributes an empty
    list so the positions stay aligned.
    '''

    items = {
        'keyword': keyword,
        'title': [],
        'url': [],
        'description': [],
        'h1': [],
        'h2': [],
        'h3': [],
        'h4': [],
        'h5': [],
        'h6': []
    }

    # NOTE(review): the CSS classes below are tied to Google's result markup
    # and presumably need updating whenever that markup changes.

    # url (links without an href and links to the translate service are skipped)
    for link in driver.find_elements('css selector', '.r > a'):
        href = link.get_attribute('href')
        if href and 'translate' not in href:
            items['url'].append(href.strip())

    # title
    items['title'] = [
        title.text.strip()
        for title in driver.find_elements('css selector', '.r > a > .LC20lb')
    ]

    # description
    items['description'] = [
        description.text.strip()
        for description in driver.find_elements('css selector', '.s > div > .st')
    ]

    # h1 to h6 of every result page
    for url in items['url']:
        try:
            r = requests.get(url, timeout=timeout)
            headings = _extract_headings(BeautifulSoup(r.content, 'lxml'))
        except requests.exceptions.RequestException as e:
            # One unreachable site (SSL verification failure, timeout, ...)
            # must not abort the whole run: report it and record no headings.
            print('ページを取得できませんでした：' + url + ' (' + str(e) + ')')
            headings = {tag: [] for tag in _HEADING_TAGS}
        time.sleep(1)  # wait one second between requests

        for tag in _HEADING_TAGS:
            items[tag].append(headings[tag])

    return items


# Heading tags collected from every result page
_HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def _extract_headings(soup):
    '''
    Return a dict mapping each tag in _HEADING_TAGS to the list of stripped,
    non-empty texts of the elements with that tag found in soup.
    '''

    headings = {}
    for tag in _HEADING_TAGS:
        texts = (node.text.strip() for node in soup.find_all(tag))
        headings[tag] = [text for text in texts if text]
    return headings


def googlespreadsheets(items, keyword, count):
    '''
    Write the collected items to a new worksheet named after the keyword.

    items: dict returned by get_info().
    keyword: search keyword; used as the worksheet title and written to A1.
    count: number of attempts made so far for this keyword.

    Returns 3 when the caller should stop (the data was written, or a
    worksheet for the keyword already exists) and count + 1 when the request
    limit was hit and the caller should try again.

    Raises gspread.exceptions.APIError for any API error other than the
    429 / 400 cases handled here, instead of silently returning None.
    '''

    # Limits noted by the author:
    # (1) 100 requests per 100 seconds per user
    # (2) at most 1,000 can be set by one program
    # (3) at most 10 per second

    # Log in to the Google API with the OAuth2 credentials
    gc = gspread.authorize(credentials)

    # Becomes True once the worksheet has been created
    flag = False

    try:
        # Open the shared spreadsheet and add a worksheet for this keyword
        workbook = gc.open_by_key(SPREADSHEET_KEY)
        worksheet = workbook.add_worksheet(title=keyword, rows='100', cols='100')
        flag = True

        # Row 1: keyword
        worksheet.update_cell(1, 1, keyword)
        time.sleep(1)

        # Row 2: ranking (1, 2, 3, ... one per title)
        _write_row(worksheet, 2, range(1, len(items['title']) + 1))
        # Row 3: titles, row 4: URLs, row 5: descriptions
        _write_row(worksheet, 3, items['title'])
        _write_row(worksheet, 4, items['url'])
        _write_row(worksheet, 5, items['description'])

        # Rows 6 to 11: h1 to h6; the headings of one page are joined into a
        # single cell, and a page without that heading level gets 'なし'
        for row, tag in enumerate(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'), start=6):
            cells = [
                '＊＊＊＊＊'.join(headings) if headings else 'なし'
                for headings in items[tag]
            ]
            _write_row(worksheet, row, cells)

        return 3

    # Error handling
    except gspread.exceptions.APIError as e:
        message = str(e)
        # The request limit was reached
        if '"code": 429' in message:
            if flag:
                # Drop the partially written worksheet so the retry starts clean
                workbook.del_worksheet(worksheet)
            print('１００秒待機してリトライします')
            time.sleep(100)
            return count + 1
        # Data for this keyword already exists in the spreadsheet
        if '"code": 400' in message:
            print('既に同じキーワードが存在します')
            return 3
        # Anything else is unexpected; falling through would return None and
        # make the caller's retry loop spin forever
        raise


def _write_row(worksheet, row, values):
    '''
    Write values into consecutive cells of row, starting at column 1, then
    wait 3 seconds before the next batch of requests.
    '''

    for column, value in enumerate(values, start=1):
        worksheet.update_cell(row, column, value)
    time.sleep(3)

# Run the scraper only when this file is executed as a script, not when it is imported
if __name__ == '__main__':
    main()

今までは何回か普通にスクレイピングできていたのですが、今回下のようなSSLエラーが起こりました。

terminal
    raise SSLError(e, request=request)
requests.exceptions.SSLError: HTTPSConnectionPool(host='www.isc.meiji.ac.jp', port=443): Max retries exceeded with url: /~mizutani/python/intro7_python.html (Caused by SSLError(SSLError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:833)'),))

実行環境
MacとWindowsの両方でやりました。
Mac OSX Mojave
Windows 10

## やってみたこと

下記のコードを追加してみましたが、同じエラーが出てしまいます。

Python
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

お分かりになる方いらっしゃいませんでしょうか？どうぞよろしくお願いいたします。

yodel

2019/07/18 12:03

「Pythonのrequestsを使ってグーグルの検索結果のタイトルやHタグを抜き出すスクレイピング」についてコード記載したほうが再現～原因確認しやすい気がします。　SSL認証で失敗するサイトがいくつかあるようにエラーログからは見えます。

退会済みユーザー

2019/07/18 12:09

そうですね。失礼しました。今ソースコードを記載いたしました。

yodel

2019/07/23 09:08

・今再実施しても再現しますか？・エラーが発生する接続先は常に同じですか？同じIP接続元から短時間に数回接続された場合にエラーとされていないかを気にしています。

行動規範の内容に同意します

回答2件

リクエストしている実装部分のソースコードが無いから
分からないですけれど、この辺の情報は参考になりませんか？

https://qiita.com/j_tamura/items/5a22b102a58d1fa93a78

投稿2019/07/18 05:19

yodel

総合スコア508

退会済みユーザー

2019/07/18 07:03

ありがとうございます。こちらも実行してみましたが、また同じエラーが出てしまいました。

行動規範の内容に同意します

利用規約に反した行動をとったためGoogleにブロックされてしまった可能性も考えられます。
利用規約を確認ください。

投稿2019/07/18 02:37

can110

総合スコア38262

hayataka2049

2019/07/18 04:54 編集

エラーが出たときの接続先がwww.isc.meiji.ac.jpなら、とりあえずグーグルは関係ないのでは。 ……なんでグーグル叩いて明治大学なんだろう？（検索結果のページを個別に叩いた？）どのみち、本当にグーグルを使っているならルール的にはだめですね。

can110

2019/07/18 05:11

あ、接続先を見逃してました。Google関係ないですね。 ↓のページを取得しようとしてたようですね https://www.isc.meiji.ac.jp/~mizutani/python/intro7_python.html

退会済みユーザー

2019/07/18 07:04 編集

すみません。私が説明不足でした。seleniumを使って「python 関数」というキーワードの1位から10位へそれぞれアクセスしてもらい、その個別のサイトごとにhttpリクエストをrequests関数を使ってかけていたという形でした。 https://www.isc.meiji.ac.jp/~mizutani/python/intro7_python.html のページだけからはじかれたという感じみたいですね。

行動規範の内容に同意します

あなたの回答