import requests
from bs4 import BeautifulSoup
import csv
import time
# Target URL (Douban Movie Top 250)
base_url = "https://movie.douban.com/top250"

# Request headers (mimic a browser visit)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# Create a CSV file to store the data
with open('douban_top250.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    # Header row: rank, title, rating, number of ratings, quote
    writer.writerow(['排名', '电影名称', '评分', '短评数量', '经典台词'])

    # Paginated scraping (Douban shows 25 entries per page, 10 pages in total)
    for page in range(0, 10):
        url = f"{base_url}?start={page * 25}"
        # Send the HTTP request
        response = requests.get(url, headers=headers)
        # Check whether the request succeeded
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Parse the movie entries on this page
            items = soup.find_all('div', class_='item')
            for item in items:
                # Extract fields
                rank = item.find('em').text  # rank
                title = item.find('span', class_='title').text  # Chinese title
                rating = item.find('span', class_='rating_num').text  # rating
                # Number of ratings: last <span> in the star block, e.g. "1234567人评价"
                comments = item.find('div', class_='star').find_all('span')[-1].text.replace('人评价', '')
                # Quote (some entries have none)
                quote_tag = item.find('p', class_='quote')
                quote = quote_tag.get_text(strip=True) if quote_tag else '无'
                # Write the row to the CSV
                writer.writerow([rank, title, rating, comments, quote])
            print(f"Page {page + 1} scraped")
            time.sleep(2)  # polite delay between requests
        else:
            print(f"Request failed with status code {response.status_code}")

print("All data saved to douban_top250.csv")