from lxml import etree
import pandas as pd
import requests
# Browser-like User-Agent sent with every request.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'}

# The list is fetched as 10 pages, each addressed by a `start` offset of 25.
PAGE_COUNT = 10
PAGE_SIZE = 25
# Request timeout in seconds.
REQUEST_TIMEOUT = 200
# Placeholder stored when a book has no one-line quote.
NO_COMMENT = "无评论"
# Column order of the resulting table / CSV file.
COLUMNS = ['name', 'url', 'author', 'publisher', 'date', 'price', 'rate', 'comment']
# One <tr> per book; every field is looked up relative to its own row.
BOOK_ROW_XPATH = '//*[@id="content"]/div/div[1]/div/table/tr'


def parse_description(desc):
    """Split a book description line into its individual fields.

    The description is a '/'-separated string whose last three fields are
    publisher, publication date and price; anything before them is
    author/translator information, of which only the first field is kept.

    Args:
        desc: Raw description text, e.g.
            '作者 / 译者 / 出版社 / 2006-5 / 29.00元'.

    Returns:
        A tuple ``(author, publisher, date, price)`` of stripped strings.
        Fields that are absent come back as empty strings instead of
        raising ``IndexError``.
    """
    parts = [part.strip() for part in desc.split('/')]
    # With three or fewer fields there is nothing in front of
    # publisher/date/price, so there is no author to report.
    author = parts[0] if len(parts) > 3 else ''
    # Right-align the trailing fields, padding missing ones on the left.
    tail = [''] * max(0, 3 - len(parts)) + parts[-3:]
    publisher, date, price = tail
    # Drop the currency suffix and any whitespace left around the number.
    return author, publisher, date, price.strip('元').strip()


def first_or(values, default=''):
    """Return the first item of an XPath result list, or *default* if empty."""
    return values[0] if values else default


def parse_page(text):
    """Extract one record per book from the HTML of a single list page.

    Each field is read relative to the book's own table row, so a book
    that lacks a quote gets the placeholder itself rather than shifting
    the quotes of all following books.

    Args:
        text: HTML source of one list page.

    Returns:
        A list of dicts keyed by the names in ``COLUMNS``.
    """
    dom = etree.HTML(text)
    # Defensive: the parser may yield no tree for an empty document.
    if dom is None:
        return []

    records = []
    for row in dom.xpath(BOOK_ROW_XPATH):
        titles = row.xpath('td[2]/div[1]/a/@title')
        if not titles:
            # Not a book row (no titled link); nothing to record.
            continue
        desc = first_or(row.xpath('td[2]/p[1]/text()'))
        author, publisher, date, price = parse_description(desc)
        records.append({
            'name': titles[0],
            'url': first_or(row.xpath('td[2]/div[1]/a/@href')),
            'author': author,
            'publisher': publisher,
            'date': date,
            'price': price,
            'rate': first_or(row.xpath('td[2]/div[2]/span[2]/text()')),
            'comment': first_or(row.xpath('td[2]/p[2]/span/text()'), NO_COMMENT),
        })
    return records


def fetch_page(page_index):
    """Download one list page and return its HTML text.

    Args:
        page_index: Zero-based page number.

    Returns:
        The response body as text.

    Raises:
        requests.HTTPError: If the server answers with an error status,
            so an error page is never parsed as if it were book data.
    """
    url = f'https://book.douban.com/top250?start={page_index * PAGE_SIZE}'
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


def scrape_top250(page_count=PAGE_COUNT):
    """Scrape *page_count* list pages and return all books as a DataFrame.

    Args:
        page_count: Number of pages to fetch, starting from the first.

    Returns:
        A ``pandas.DataFrame`` with the columns listed in ``COLUMNS``.
    """
    records = []
    for k in range(page_count):
        print('正在爬取第%d页的数据' % (k + 1))
        records.extend(parse_page(fetch_page(k)))
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=COLUMNS)


if __name__ == '__main__':
    alldata = scrape_top250()
    # utf_8_sig writes a BOM at the start of the file.
    alldata.to_csv('豆瓣Top250.csv', index=False, encoding='utf_8_sig')
爬取豆瓣网图书 TOP250 的数据(共 250 本书),包括书名、链接、作者、出版社、出版时间、价格、评分和评语。
最新推荐文章于 2024-04-26 18:35:17 发布