安装lxml
pip install lxml
代码
import requests
from bs4 import BeautifulSoup as bs
import time
import lxml
# Listing URL prefix; the page loop appends "-<page number>" to it.
url = "https://bbs.hupu.com/bxj-postdate"

# Browser user-agent string sent with every request.
useragent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"

# HTTP headers passed to requests.get() for each page.
header = {
    'user-agent': useragent,
    # Hupu requires a logged-in session to view page 11 and beyond.
    # A cookie copied from a logged-in browser session expires, so it has to
    # be copied again before later runs.
    'cookie': 'your cookie',
}
for page in range(0,50):
page_url = url + '-' + str(page+1)
print(f'------------------ 第{page+1}页内容 {page_url}-------------------')
response = requests.get(page_url, headers=header)
last = time.time()
selector = lxml.etree.HTML(response.text)
ul_li = selector.xpath('//*[@id="ajaxtable"]/div[1]/ul/li')
for li in ul_li:
item = {
}
# 标题
title_box = li.xpath('./div[@class="titlelink box"]')[0]
item['link'] = hupu_domin + title_box.xpath('./a[@class="truetit"]/@href[1]')[0]
# 测试发现有些标题会用<b>标签包裹(具体见下一个单元格),需要特殊处理