beautifulsoupでtableのスクレイピングがしたい

タイトルの通りスクレイピングを試みているのですが、なかなかうまくいきません。
もともとうまくいっていたところに、リンクのURLもリストに追加したいと考え後付けで加えたところ、リストの先頭が上書きされてしまうようになってしまいました。

python3
1import requests
2from bs4 import BeautifulSoup
3
4#table→list
def _span_attr(raw):
    """Return a rowspan/colspan attribute as an int.

    A missing, non-numeric or negative attribute counts as 1. Zero is
    returned unchanged so the caller can decide what it means.
    """
    try:
        span = int(raw)
    except (TypeError, ValueError):
        return 1
    return span if span >= 0 else 1


def to_list(table):
    """Convert an HTML table into a list of rows of cell values.

    Only ``<td>`` cells are read. A cell that contains an ``<a>`` contributes
    the ``href`` of its first link; any other cell contributes its stripped
    text. A cell with ``rowspan``/``colspan`` is repeated in every grid
    position it covers, and cells in later rows are shifted right past the
    columns a rowspan from above already occupies.

    Args:
        table: Parsed element (``<table>`` or ``<tbody>``) whose ``<tr>``
            descendants are the rows to read.

    Returns:
        One list per ``<tr>``. A row without any cell is ``[]``; a column
        position inside a row that no cell covers is filled with ``""``.
    """
    trs = table.find_all("tr")
    row_count = len(trs)

    # Sparse grid: one {column index: value} mapping per row. Appending to a
    # plain list would put a cell pushed down by a rowspan at the front of
    # the later row instead of in the column it really belongs to.
    grid = [{} for _ in range(row_count)]

    for i, tr in enumerate(trs):
        col = 0
        for td in tr.find_all("td"):
            # Skip columns already taken by a rowspan from an earlier row.
            while col in grid[i]:
                col += 1

            a = td.find("a")
            value = td.get_text(strip=True) if a is None else a.get("href")

            # rowspan="0" means "down to the last row"; colspan="0" is
            # treated as 1.
            height = _span_attr(td.get("rowspan")) or row_count - i
            width = _span_attr(td.get("colspan")) or 1

            # Clip at the last row so an oversized rowspan cannot index past
            # the end of the grid.
            for r in range(i, min(i + height, row_count)):
                for c in range(col, col + width):
                    grid[r][c] = value
            col += width

    data = []
    for cells in grid:
        if cells:
            data.append([cells.get(c, "") for c in range(max(cells) + 1)])
        else:
            data.append([])
    return data
37
38
39
40
url = 'https://www.wakayama-med.ac.jp/hospital/gairai/schedule/index.html'
# Bound the wait so a stalled server cannot hang the script, and stop on an
# HTTP error instead of parsing an error page.
page = requests.get(url, timeout=10)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
bordered = soup.find("table", {"class": "bordered"})
if bordered is None:
    raise SystemExit('no <table class="bordered"> found at ' + url)
# Fall back to the table element itself when the markup has no <tbody>;
# to_list() only needs something whose <tr> descendants it can search.
table = bordered.tbody if bordered.tbody is not None else bordered

code = "A1312970051"  # not used anywhere in this snippet yet

data = to_list(table)
for i in data:
    print(i)
52
53

まだ途中の段階ですが、どうすればうまくいくのでしょうか。

python
1else:
2                rowspan = td.get('rowspan')
3                colspan = td.get('colspan')
4                url = a.get("href")
5                row = int(rowspan) if rowspan else 1
6                col = int(colspan) if colspan else 1
7                
8                for j in range(row):
9                    for x in range(col):
10                        data[i + j].append(url)

ここを追加してからおかしいのでこの部分が間違っているだろうと思いますが、どのようにすればURLで上書きするのではなく、URLを追加することができますか。
ちなみにURLがない行は""を追加したいですが、それもわかりません。
よろしくお願いいたします。

meg_

2021/04/29 11:44

> リストの先頭が上書きされてしまうようになってしまいました。

上記の意味が分かりませんでした。現在、dataの中身はどうなっていて、想定の結果はどうなっていたのでしょうか？リストの先頭数個で良いので追記されると回答が付きやすくなるかと思います。

行動規範の内容に同意します

回答1件

現状、どのような入力の結果、どのようにまずいのかがよく分かりませんが
まずテーブルを二次元リストに格納する処理においてrowspan,colspanにちゃんと対応できていないようです。
この処理はStack Overflowの「How to parse table with rowspan and colspan」を参考に書き換えます。
さらに

> URLで上書きするのではなく、URLを追加

というのもよくわからないのですが
td要素から値を取得する処理を別関数に分離して見通しをよくします。

Python
1
2from bs4 import BeautifulSoup
3from itertools import product
4
5# How to parse table with rowspan and colspan
6# https://stackoverflow.com/questions/48393253/how-to-parse-table-with-rowspan-and-colspan
def table_to_2d(table_tag, value_func):
    """Expand an HTML table into a rectangular grid of cell values.

    Every direct ``<td>``/``<th>`` child of each ``<tr>`` found under
    *table_tag* is passed to *value_func*, and the result is stored in every
    grid position the cell covers through its ``rowspan``/``colspan``.

    Args:
        table_tag: Parsed element (``<table>`` or ``<tbody>``) that offers
            ``find_all``.
        value_func: Callable that takes one cell element and returns the
            value to store for it.

    Returns:
        One list per ``<tr>``, all of the same length. Positions that no
        cell covers stay ``None``.
    """
    tr_tags = table_tag.find_all('tr')
    n_rows = len(tr_tags)

    # Pass 1: work out how many columns the grid needs.
    n_cols = 0
    open_spans = []  # rows still owed to rowspans started in earlier rows
    for r, tr in enumerate(tr_tags):
        cells = tr.find_all(['td', 'th'], recursive=False)
        # The last cell always counts as one column wide, so an oversized
        # trailing colspan cannot create columns that hold no real cell.
        # A colspan of 0 ("fill to the end") on any other cell counts as 1.
        needed = len(open_spans)
        if cells:
            needed += 1
        for cell in cells[:-1]:
            needed += int(cell.get('colspan', 1)) or 1
        n_cols = max(n_cols, needed)
        # A rowspan of 0 runs down to the bottom of the table.
        for cell in cells:
            open_spans.append(int(cell.get('rowspan', 1)) or n_rows - r)
        open_spans = [left - 1 for left in open_spans if left > 1]

    # Rowspans still open after the last row are simply ignored: there are
    # no further rows for them to cover.

    # Pass 2: place every cell value into a pre-sized grid.
    grid = [[None] * n_cols for _ in tr_tags]
    carried = {}  # column -> rows (current one included) a cell still covers
    for r, tr in enumerate(tr_tags):
        next_col = 0  # first column the next cell may start in
        for cell in tr.find_all(['td', 'th'], recursive=False):
            # Step over columns occupied by rowspans from preceding rows.
            c = next_col
            while carried.get(c, 0):
                c += 1

            height = int(cell.get('rowspan', 1)) or n_rows - r
            carried[c] = height
            width = int(cell.get('colspan', 1)) or n_cols - c
            next_col = c + width

            # Fetch the value for this cell.
            value = value_func(cell)

            # Clip to the grid so spans reaching outside the table are cut.
            for rr in range(r, min(r + height, n_rows)):
                for cc in range(c, min(c + width, n_cols)):
                    grid[rr][cc] = value
                    carried[cc] = height

        # One row consumed: age every open rowspan and drop finished ones.
        carried = {col: left - 1 for col, left in carried.items() if left > 1}

    return grid
68
69# セルから値を取得
def get_value(cell):
    """Return a cell's text, followed by the href of its first link if any.

    Args:
        cell: Parsed ``<td>``/``<th>`` element.

    Returns:
        The cell text as returned by ``get_text()``. When the cell contains
        an ``<a>`` with a non-empty ``href``, that URL is appended directly
        to the text (a provisional format).
    """
    value = cell.get_text()
    a = cell.find('a')
    if a is not None:
        href = a.get('href')
        # An <a> without href (e.g. a named anchor) yields None, which
        # cannot be concatenated to a str.
        if href:
            value += href
    return value
76
77
# Test data
# https://www.tablesgenerator.com/html_tables

# Fixture 1: a colspan in the first row and a rowspan in the third column.
# +-------------+-------+------+
# | R1C12       | R12C3 | R1C4 |
# +------+------+       +------+
# | R2C1 | R2C2 |       | R2C4 |
# +------+------+-------+------+
t1 = """
<table>
<tbody>
<tr>
<td colspan="2">R1C12</td>
<td rowspan="2">R12C3</td>
<td>R1C4</td>
</tr>
<tr>
<td>R2C1</td>
<td>R2C2</td>
<td>R2C4</td>
</tr>
</tbody>
</table>
"""

# Fixture 2: two staggered rowspans, one in each column.
# +-------+-------+
# | R12C1 | R1C2  |
# |       +-------+
# |       | R23C2 |
# +-------+       |
# | R3C1  |       |
# +-------+-------+
t2 = """
<table>
<tbody>
  <tr>
    <td rowspan="2">R12C1</td>
    <td>R1C2</td>
  </tr>
  <tr>
    <td rowspan="2">R23C2</td>
  </tr>
  <tr>
    <td>R3C1</td>
  </tr>
</tbody>
</table>
"""

# Page template; the {{table}} placeholder is replaced with a fixture.
base = """
<!DOCTYPE HTML>
<html lang="ja">
<head>
<meta charset="utf-8">
<title>test</title>
<style>
  table { border-collapse: collapse; }
  tr { height: 2em; }
  td { border: solid 1px black; }
</style>
</head>
<body>
{{table}}
</body>
</html>"""
143
# Drop each fixture into the page template, parse the page, and print the
# resulting grid one row per line.
for fragment in (t1, t2):
    html = base.replace('{{table}}', fragment)
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table').tbody
    data = table_to_2d(table, get_value)
    print('-----')
    for grid_row in data:
        print(grid_row)
153
154"""
155-----
156['R1C12', 'R1C12', 'R12C3', 'R1C4']
157['R2C1', 'R2C2', 'R12C3', 'R2C4']
158-----
159['R12C1', 'R1C2']
160['R12C1', 'R23C2']
161['R3C1', 'R23C2']
162"""