下面的代码会按行给出你页面中表格的结果:
from bs4 import BeautifulSoup
import urllib.request
# Print every cell of the hourly price/volume table in the saved page,
# one cell per line, each followed by a comma.

# Read the saved page; the context manager closes the handle even if the
# read fails.
with urllib.request.urlopen('file:///F:/test.html') as response:
    html = response.read()

# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and to avoid bs4's "no parser specified"
# warning). 'html.parser' ships with the standard library.
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', attrs={'class': 'list hours responsive'})
if table is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit(
        "No <table class='list hours responsive'> found in test.html"
    )

rows = table.find_all('tr')
for tr in rows:
    for td in tr.find_all('td'):
        # find(string=True) returns the first text node of the cell, or None
        # when the cell contains no text at all; such cells print as "000".
        first_string = td.find(string=True)
        text = "000" if first_string is None else str(first_string)
        print(text + ",")
我测试用的 HTML 页面保存为 F: 盘上的 test.html,其中的表格如下:
| | | Thu, 11/02 | Fri, 12/02 | Sat, 13/02 | Sun, 14/02 | Mon, 15/02 | Tue, 16/02 | Wed, 17/02 |
|---|---|---|---|---|---|---|---|---|
| 00 - 01 | €/MWh | 23.82 | 22.81 | 22.23 | 13.06 | 16.57 | 25.99 | 32.45 |
| | MWh | 10,266.0 | 9,626.6 | 12,255.9 | 11,084.7 | 11,039.5 | 13,134.7 | 9,958.1 |
| 01 - 02 | €/MWh | 21.48 | 21.59 | 21.10 | 12.17 | 16.00 | 23.65 | 31.27 |
| | MWh | 9,843.3 | 9,494.4 | 11,823.3 | 10,531.9 | 9,970.5 | 12,875.6 | 9,958.8 |
| 02 - 03 | €/MWh | 21.00 | 21.30 | 20.21 | 8.81 | 14.55 | 22.91 | 29.72 |
| | MWh | 9,857.0 | 9,427.9 | 11,755.2 | 10,061.9 | 9,881.7 | 12,841.0 | 9,896.9 |
| 03 - 04 | €/MWh | 19.94 | 19.86 | 19.94 | 6.74 | 13.14 | 22.04 | 27.44 |
| | MWh | 9,486.2 | 10,492.7 | 12,609.1 | 11,216.6 | 10,199.9 | 11,209.7 | 9,698.5 |
代码的输出如下:
00 - 01,
€/MWh,
23.82,
22.81,
22.23,
13.06,
16.57,
25.99,
32.45,
,
MWh,
10,266.0,
9,626.6,
12,255.9,
11,084.7,
11,039.5,
13,134.7,
9,958.1,
01 - 02,
€/MWh,
21.48,
21.59,
21.10,
12.17,
16.00,
23.65,
31.27,
,
MWh,
9,843.3,
9,494.4,
11,823.3,
10,531.9,
9,970.5,
12,875.6,
9,958.8,
02 - 03,
€/MWh,
21.00,
21.30,
20.21,
8.81,
14.55,
22.91,
29.72,
,
MWh,
9,857.0,
9,427.9,
11,755.2,
10,061.9,
9,881.7,
12,841.0,
9,896.9,
03 - 04,
€/MWh,
19.94,
19.86,
19.94,
6.74,
13.14,
22.04,
27.44,
,
MWh,
9,486.2,
10,492.7,
12,609.1,
11,216.6,
10,199.9,
11,209.7,
9,698.5,