編集履歴

質問編集履歴

作成中に誤って送信してしまったので補足しました。

2021/11/15 13:40

投稿

BigCulture

スコア2

test CHANGED Viewed

File without changes

test CHANGED Viewed

@@ -14,17 +14,25 @@
+### 情報の取得方法
+URLの取得 → クラスの抽出 → `a`の抽出 → `href`の抽出 → URLを絶対表記に置換
 ### 処理の流れ
-1. トップページから情報を抽出する
+1.トップページから情報を抽出する
 > https://www.sej.co.jp/products/
-2. 商品カテゴリ別に情報を格納した後、関東地方にリンクを限定する
+2.商品カテゴリ別に情報を格納した後、関東地方にリンクを限定する
 > 'https://www.sej.co.jp/products/a/onigiri/kanto/',
@@ -34,19 +42,305 @@
-3. 関東地方だけに設定した商品リンクから、情報を抽出する
-### 情報の取得方法
-URLの取得 → クラスの抽出 → `a`の抽出 → `href`の抽出 → URLを絶対表記に置換
-情報の格納先 → `contents`
+3.関東地方だけに設定した商品リンクから、情報を抽出する
+### 該当のソースコード
+```Python
+import re
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+from time import sleep
+# URLを取得する関数
+def GetUrl(GetTheURL):
+    # 取得したいURL
+    url = GetTheURL
+    global r
+    # urlを引数に指定して、HTTPリクエストを送信してHTMLを取得、取得したデータを変数 r に格納
+    r = requests.get(url)
+    # 格納したデータの文字コードを自動でエンコーディング
+    r.encoding = r.apparent_encoding
+    sleep(2)
+# 取得したいURLからクラスを抽出する関数
+def GetSoupClass(GetClass, value):
+    soup = BeautifulSoup(r.text, 'html.parser')
+    global contents
+    content = []
+    # find or find_allを設定するための値
+    value == 0
+    if value == 0:
+        contents = soup.find(class_= GetClass)
+    elif value == 1:
+        contents = soup.find_all(class_= GetClass)
+    else:
+        # ここをindex[3,5,7,9]番目だけ抽出したい
+        contents = soup.find_all(class_= GetClass)[3]
+        content.append(contents)
+        contents = content
+# soupで入手した値から a を抽出する関数
+def Find_a(FindContents):
+    global ProductList
+    ProductList = []
+    for i in range(len(FindContents)):
+        content = FindContents[i].find('a')
+        ProductList.append(content)
+# soupで取得した値から href を抽出する関数
+def GetHref(hrefValue):
+    global ProductLink
+    ProductLink = []
+    for i in range(len(hrefValue)):
+        link_ = hrefValue[i].get('href')
+        ProductLink.append(link_)
+# URLを相対表記から絶対表記に置換する関数
+def ReplaceURL(Before, After):
+    global ProductLink
+    ProductLink = [item.replace(Before, After) for item in ProductLink]
+GetUrl("https://www.sej.co.jp/products/")
+GetSoupClass("sideCategoryNav", 0)
+# HTMLから商品情報が格納されている a タグを全て表示
+get_a = contents.find_all('a')
+GetHref(get_a)
+ReplaceURL("/products", "https://www.sej.co.jp/products")
+# 関東地方のみに商品カテゴリを絞り込む
+ProductLinkKanto = []
+for i in range(len(ProductLink)):
+    text = ProductLink[i]
+    # URLの末尾にkanto/を追加
+    Newtext = re.sub('$',"kanto/",text)
+    ProductLinkKanto.append(Newtext)
+# 今週の新商品と来週の新商品は内容が重複するため削除する
+ProductLink =  ProductLinkKanto[2:18]
+# 取得したリンク先に、カテゴリ別の表記を格納する
+# カテゴリ別の表記がない場合は、リンクをそのまま格納する
+ProductList = []
+for i in range(len(ProductLink)):
+    GetUrl(ProductLink[i])
+    if i < 2:
+        GetSoupClass("list_btn brn pbNested pbNestedWrapper", 1)
+    elif i < 3:
+        GetSoupClass("list_btn pbNested pbNestedWrapper", 1)
+    elif i == 4 or i == 6 or i >= 10 and i <= 13:
+        GetSoupClass("pbBlock pbBlockBase", 2)
+    else:
+        contents = ProductLink[i]
+    ProductList.append(contents)
+```
+### 試したこと
+`GetSoupClass`の
+```Python
+    if value == 0:
+        contents = soup.find(class_= GetClass)
+    elif value == 1:
+        contents = soup.find_all(class_= GetClass)
+    else:
+        # 3,5,7,9番目だけ抽出する
+        contents = soup.find_all(class_= GetClass)[3]
+        content.append(contents)
+        contents = content
+```
+の部分について、奇数番号のみを取得したかったので
+```
+    for i in range(10):
+        if value == 2 and i % 2 == 1:
+            contents = soup.find_all(class_= GetClass)[i]
+            content.append(contents)
+        elif value == 1:
+            contents = soup.find_all(class_= GetClass)
+        else:
+            contents = soup.find(class_= GetClass)
+        contents = content
+```
+と変更して試してみました。
@@ -56,251 +350,49 @@
 ```
-```
-### 該当のソースコード
-```Python
-import re
-import requests
-import pandas as pd
-from bs4 import BeautifulSoup
-from time import sleep
-# URLを取得する関数
-def GetUrl(GetTheURL):
-    # 取得したいURL
-    url = GetTheURL
-    global r
-    # urlを引数に指定して、HTTPリクエストを送信してHTMLを取得、取得したデータを変数 r に格納
-    r = requests.get(url)
-    # 格納したデータの文字コードを自動でエンコーディング
-    r.encoding = r.apparent_encoding
-    sleep(2)
-# 取得したいURLからクラスを抽出する関数
-def GetSoupClass(GetClass, value):
-    soup = BeautifulSoup(r.text, 'html.parser')
-    global contents
-    content = []
-    # find or find_allを設定するための値
-    value == 0
-    if value == 0:
-        contents = soup.find(class_= GetClass)
-    elif value == 1:
-        contents = soup.find_all(class_= GetClass)
-    else:
-        # ここをindex[3,5,7,9]番目だけ抽出したい
-        contents = soup.find_all(class_= GetClass)[3]
-        content.append(contents)
-        contents = content
-# soupで取得した値から href を抽出する関数
-def GetHref(hrefValue):
-    global ProductLink
-    ProductLink = []
-    for i in range(len(hrefValue)):
-        link_ = hrefValue[i].get('href')
-        ProductLink.append(link_)
-# soupで入手した値から a を抽出する関数
-def Find_a(FindContents):
-    global ProductList
-    ProductList = []
-    for i in range(len(FindContents)):
-        content = FindContents[i].find('a')
-        ProductList.append(content)
-# URLを相対表記から絶対表記に置換する関数
-def ReplaceURL(Before, After):
-    global ProductLink
-    ProductLink = [item.replace(Before, After) for item in ProductLink]
-GetUrl("https://www.sej.co.jp/products/")
-GetSoupClass("sideCategoryNav", 0)
-# HTMLから商品情報が格納されている a タグを全て表示
-get_a = contents.find_all('a')
-GetHref(get_a)
-ReplaceURL("/products", "https://www.sej.co.jp/products")
-# 関東地方のみに商品カテゴリを絞り込む
-ProductLinkKanto = []
-for i in range(len(ProductLink)):
-    text = ProductLink[i]
-    # URLの末尾にkanto/を追加
-    Newtext = re.sub('$',"kanto/",text)
-    ProductLinkKanto.append(Newtext)
-# 今週の新商品と来週の新商品は内容が重複するため削除する
-ProductLink =  ProductLinkKanto[2:18]
-# 取得したリンク先に、カテゴリ別の表記を格納する
-# カテゴリ別の表記がない場合は、リンクをそのまま格納する
-ProductList = []
-for i in range(len(ProductLink)):
-    GetUrl(ProductLink[i])
-    if i < 2:
-        GetSoupClass("list_btn brn pbNested pbNestedWrapper", 1)
-    elif i < 3:
-        GetSoupClass("list_btn pbNested pbNestedWrapper", 1)
-    elif i == 4 or i == 6 or i >= 10 and i <= 13:
-        GetSoupClass("pbBlock pbBlockBase", 2)
-    else:
-        contents = ProductLink[i]
-    ProductList.append(contents)
-```
-### 試したこと
-ここに問題に対して試したことを記載してください。
+```
+---------------------------------------------------------------------------
+IndexError                                Traceback (most recent call last)
+<ipython-input-36-eb08a21b7137> in <module>
+     13         GetSoupClass("list_btn pbNested pbNestedWrapper", 1)
+     14     elif i == 4 or i == 6 or i >= 10 and i <= 13:
+---> 15         GetSoupClass("pbBlock pbBlockBase", 2)
+     16     else:
+     17         contents = ProductLink[i]
+<ipython-input-35-c1265b5f2c6f> in GetSoupClass(GetClass, value)
+     13     for i in range(10):
+     14         if value == 2 and i % 2 == 1:
+---> 15             contents = soup.find_all(class_= GetClass)[i]
+     16             content.append(contents)
+     17         elif value == 1:
+IndexError: list index out of range
+```
+となってしまいました。これらの解決策についてご教示いただけると幸いです。
@@ -308,4 +400,12 @@
+python 3.8.8
+### 最後に
-ここにより詳細な情報を記載してください。
+プログラミング初心者のため拙いコードでの質問になりますが、何卒よろしくお願いいたします。