简单的爬虫Demo（豆瓣）

最新推荐文章于 2025-05-04 17:10:36 发布

疯狂的攻城狮-Alun

最新推荐文章于 2025-05-04 17:10:36 发布

阅读量613

点赞数 1

文章标签：爬虫

本文链接：https://blog.youkuaiyun.com/Alun_kong/article/details/146319485

版权

import requests
from bs4 import BeautifulSoup
import csv
import time

# Target URL (Douban Movie Top 250).
base_url = "https://movie.douban.com/top250"

# Request headers that mimic a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
}

# Paging: the list is fetched as PAGE_COUNT pages of PAGE_SIZE entries each,
# selected through the ``start`` query parameter.
PAGE_SIZE = 25
PAGE_COUNT = 10

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10

# Seconds to pause between consecutive requests (politeness delay).
REQUEST_DELAY = 2

# Output file in the parent directory. A forward slash is used because the
# original '..\d...' literal relied on an invalid escape sequence and was only
# a path separator on Windows; '/' is accepted on Windows and POSIX alike.
OUTPUT_PATH = '../douban_top250.csv'


def _parse_item(item):
    """Build one CSV row from a single movie entry.

    Args:
        item: A parsed ``<div class="item">`` element.

    Returns:
        list: ``[rank, title, rating, comments, quote]`` as strings. ``quote``
        is "无" when the entry has no ``<p class="quote">`` element.

    Raises:
        AttributeError, IndexError: If an expected element is missing from
            the entry (for example when the page layout differs).
    """
    rank = item.find('em').text
    # First span with class "title" (presumably the Chinese title).
    title = item.find('span', class_='title').text
    rating = item.find('span', class_='rating_num').text

    # The rating count is the last <span> of the first <div> inside div.bd;
    # the trailing "人评价" label is removed so only the number remains.
    first_div = item.find('div', class_='bd').find_all('div')[0]
    comments = first_div.find_all('span')[-1].text.replace('人评价', '')

    # Look the quote up once; strip=True drops the whitespace that surrounds
    # the inner span so the CSV cell does not contain stray newlines.
    quote_tag = item.find('p', class_='quote')
    quote = quote_tag.get_text(strip=True) if quote_tag is not None else "无"

    return [rank, title, rating, comments, quote]


# Create the CSV file that stores the scraped data. The utf-8-sig encoding
# writes a BOM at the start of the file.
with open(OUTPUT_PATH, 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['排名', '电影名称', '评分', '短评数量', '经典台词'])

    for page in range(PAGE_COUNT):
        # Pause before every request except the first, so the delay also
        # applies after a failed page instead of only after a successful one.
        if page:
            time.sleep(REQUEST_DELAY)

        url = f"{base_url}?start={page * PAGE_SIZE}"

        # A timeout keeps a stalled connection from hanging the script, and a
        # network error on one page must not abort the remaining pages.
        try:
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f"请求失败：{exc}")
            continue

        if response.status_code != 200:
            print(f"请求失败，状态码：{response.status_code}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Each movie entry is a <div class="item">; write one row per entry.
        for item in soup.find_all('div', class_='item'):
            writer.writerow(_parse_item(item))

        print(f"第 {page+1} 页数据抓取完成")

print("全部数据已保存到 douban_top250.csv")