import re
from itertools import product

from bs4 import BeautifulSoup
# How to parse table with rowspan and colspan
# https://stackoverflow.com/questions/48393253/how-to-parse-table-with-rowspan-and-colspan

# Leading part of a span attribute: optional whitespace, optional "+", digits.
# Anything after the digits (e.g. the ";" in "2;") is ignored.
_SPAN_RE = re.compile(r'\s*\+?(\d+)', re.ASCII)


def _parse_span(cell, attr):
    """Return the non-negative integer in ``cell``'s ``attr``, or 1 if absent/invalid."""
    raw = cell.get(attr)
    if raw is None:
        return 1
    match = _SPAN_RE.match(str(raw))
    return int(match.group(1)) if match else 1


def table_to_2d(table_tag, value_func):
    """Expand an HTML table into a rectangular list of rows.

    Cells that span several rows and/or columns have their value repeated
    in every grid position they cover.

    Args:
        table_tag: Element whose ``find_all('tr')`` returns the table rows.
            Each row must support ``find_all(['td', 'th'], recursive=False)``
            and each cell must support ``get(attribute_name)``.
        value_func: Callable invoked once per cell; its return value is
            stored in every grid position covered by that cell.

    Returns:
        A list with one list per row, all of the same length.  Positions
        not covered by any cell hold ``None``.

    Notes:
        * A ``rowspan`` of 0 extends to the last row; a ``colspan`` of 0
          extends to the last column.
        * Missing, empty, negative or non-numeric span attributes are
          treated as 1 instead of raising ``ValueError``.
        * The colspan of the last cell of a row is ignored when counting
          columns, so a cell that would land beyond the computed width is
          silently dropped.
    """
    rows = table_tag.find_all('tr')
    row_total = len(rows)

    # First scan: see how many columns we need.
    colcount = 0
    pending = []  # remaining row counts of rowspans still active
    for r, row in enumerate(rows):
        cells = row.find_all(['td', 'th'], recursive=False)
        # Count columns (including spanned ones) and add the rowspans still
        # active from preceding rows.  The colspan of the last cell is
        # *ignored* (its width is hardcoded as 1) to avoid creating
        # 'phantom' columns that contain no actual cells, only extended
        # colspans.  A colspan of 0 means "fill until the end" but can
        # really only apply to the last cell; elsewhere it counts as 1.
        width = (
            sum(_parse_span(c, 'colspan') or 1 for c in cells[:-1])
            + len(cells[-1:])
            + len(pending)
        )
        colcount = max(colcount, width)
        # Update rowspan bookkeeping; 0 is a span to the bottom.
        pending += [_parse_span(c, 'rowspan') or row_total - r for c in cells]
        pending = [s - 1 for s in pending if s > 1]

    # It doesn't matter if there are still rowspan numbers 'active'; no
    # extra rows to show in the table means the larger-than-1 rowspan
    # numbers in the last table row are ignored.

    # Build an empty matrix for all possible cells.
    table = [[None] * colcount for _ in rows]

    # Fill the matrix from the row data.
    rowspans = {}  # pending rowspans: column number -> remaining row count
    for row, row_elem in enumerate(rows):
        span_offset = 0  # columns skipped so far due to row and colspans
        cells = row_elem.find_all(['td', 'th'], recursive=False)
        for col, cell in enumerate(cells):
            # Adjust for preceding row and colspans.
            col += span_offset
            while rowspans.get(col, 0):
                span_offset += 1
                col += 1

            # Resolve the spans of this cell (0 means "to the end").
            rowspan = rowspans[col] = (
                _parse_span(cell, 'rowspan') or row_total - row
            )
            colspan = _parse_span(cell, 'colspan') or colcount - col
            # The next cell is offset by this cell's colspan.
            span_offset += colspan - 1

            # Get the value for this cell.
            value = value_func(cell)

            for drow, dcol in product(range(rowspan), range(colspan)):
                try:
                    table[row + drow][col + dcol] = value
                except IndexError:
                    # rowspan or colspan outside the confines of the table
                    continue
                rowspans[col + dcol] = rowspan

        # Update rowspan bookkeeping.
        rowspans = {c: s - 1 for c, s in rowspans.items() if s > 1}

    return table
# Get the value from a cell
def get_value(cell):
    """Return the cell's text, followed by the href of its first link, if any.

    Args:
        cell: Element providing ``get_text()`` and ``find('a')``.

    Returns:
        The result of ``cell.get_text()``; when ``cell.find('a')`` yields an
        element whose ``href`` attribute is non-empty, that href is appended
        directly (no separator).
    """
    value = cell.get_text()
    anchor = cell.find('a')
    if anchor is not None:
        # Provisional: the link target is simply concatenated to the text.
        href = anchor.get('href')
        # An anchor without href (e.g. <a name="...">) yields None, which
        # must not be concatenated to a str.
        if href:
            value += href
    return value
# Test data
# https://www.tablesgenerator.com/html_tables

# +-------------+-------+------+
# | R1C12       | R12C3 | R1C4 |
# +------+------+       +------+
# | R2C1 | R2C2 |       | R2C4 |
# +------+------+-------+------+
t1 = """
<table>
<tbody>
<tr>
<td colspan="2">R1C12</td>
<td rowspan="2">R12C3</td>
<td>R1C4</td>
</tr>
<tr>
<td>R2C1</td>
<td>R2C2</td>
<td>R2C4</td>
</tr>
</tbody>
</table>
"""

# +-------+-------+
# | R12C1 | R1C2  |
# |       +-------+
# |       | R23C2 |
# +-------+       |
# | R3C1  |       |
# +-------+-------+
t2 = """
<table>
<tbody>
  <tr>
    <td rowspan="2">R12C1</td>
    <td>R1C2</td>
  </tr>
  <tr>
    <td rowspan="2">R23C2</td>
  </tr>
  <tr>
    <td>R3C1</td>
  </tr>
</tbody>
</table>
"""

# Page skeleton; the '{{table}}' placeholder is replaced by a test table.
base = """
<!DOCTYPE HTML>
<html lang="ja">
<head>
<meta charset="utf-8">
<title>test</title>
<style>
  table { border-collapse: collapse; }
  tr { height: 2em; }
  td { border: solid 1px black; }
</style>
</head>
<body>
{{table}}
</body>
</html>"""

# Parse each test table and print the resulting grid, one row per line.
for t in (t1, t2):
    html = base.replace('{{table}}', t)
    # print(html)
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find('table').tbody
    data = table_to_2d(table, get_value)
    print('-----')
    for i in data:
        print(i)

# Expected output:
"""
-----
['R1C12', 'R1C12', 'R12C3', 'R1C4']
['R2C1', 'R2C2', 'R12C3', 'R2C4']
-----
['R12C1', 'R1C2']
['R12C1', 'R23C2']
['R3C1', 'R23C2']
"""