import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import matplotlib.font_manager as fm
from requests.exceptions import RequestException, Timeout
import time
from collections import defaultdict
import os
from urllib.parse import urljoin
import random
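# Script overview: crawl the Douban Top 250 list page by page, download each poster,
# build a pandas DataFrame, compute summary statistics, and write a CSV plus PNG charts.
# The font handling below exists because matplotlib's default fonts cannot draw Chinese
# characters, which is what produces the garbled (乱码) labels this script tries to avoid.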
# Create a folder for the downloaded posters
if not os.path.exists('posters'):
os.makedirs('posters')
# Fix garbled Chinese text: enhanced font setup (the core improvement)
def setup_chinese_fonts():
    """Pick a font that supports Chinese, across Windows/macOS/Linux, so labels render correctly."""
    # Candidate Chinese fonts in priority order, with extra fallbacks
chinese_fonts = [
"SimHei", # Windows 黑体
"Microsoft YaHei", # Windows 雅黑
"WenQuanYi Micro Hei", # Linux 系统
"Heiti TC", # macOS 黑体
"Arial Unicode MS", # 通用备选
"SimSun", # 宋体
"NSimSun", # 新宋体
"SimKai", # 楷体
"FangSong", # 仿宋
"KaiTi", # 楷体_GB2312
"STSong", # 华文宋体
"STHeiti", # 华文黑体
"STKaiti", # 华文楷体
]
    # Collect every font file on the system (keep the original paths; compare case-insensitively)
    system_fonts = fm.findSystemFonts()
    # Find the first candidate that matches an installed font file
    available_font = None
    for font in chinese_fonts:
        for sys_font in system_fonts:
            if font.lower() in sys_font.lower():
                # Found a matching font file
                available_font = sys_font
                break
        if available_font:
            break
    # Use an ASCII hyphen for minus signs; many CJK fonts lack the Unicode minus glyph
    plt.rcParams["axes.unicode_minus"] = False
    if available_font:
        # Register the font file with matplotlib and switch to its real family name
        fm.fontManager.addfont(available_font)
        font_name = fm.FontProperties(fname=available_font).get_name()
        plt.rcParams["font.family"] = "sans-serif"
        plt.rcParams["font.sans-serif"] = [font_name]
        print(f"已设置中文字体: {font_name}")
        return font_name  # Return the detected font name
    else:
        # Fallback: point the sans-serif family at common Chinese fonts and hope one resolves
        plt.rcParams["font.family"] = "sans-serif"
        plt.rcParams["font.sans-serif"] = ["SimHei", "Microsoft YaHei", "WenQuanYi Micro Hei"]
        print("警告: 未检测到中文字体,使用备选方案,可能影响显示效果")
        return "sans-serif"  # Return the default font family
# Apply the seaborn style first: set_style() also touches font-related rcParams,
# so running it after the font setup could undo the Chinese font selection.
sns.set_style("whitegrid")
# Initialize the Chinese font and remember which one was picked
current_font = setup_chinese_fonts()
def download_poster(url, title, max_retries=2):
"""下载电影海报并保存到本地"""
try:
# 清理标题中的特殊字符,避免作为文件名出错
safe_title = re.sub(r'[\/:*?"<>|]', '', title)
file_path = f"posters/{safe_title}.jpg"
# 如果文件已存在,跳过下载
if os.path.exists(file_path):
return file_path
# 随机延迟避免被反爬
time.sleep(random.uniform(0.5, 1.5))
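        # A browser-like User-Agent plus a movie.douban.com Referer is sent because the
        # poster CDN tends to reject bare requests (assumption based on common
        # anti-hotlinking behaviour; adjust if downloads still fail).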
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"Referer": "https://movie.douban.com/"
}
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
with open(file_path, 'wb') as f:
f.write(response.content)
return file_path
except Exception as e:
print(f"下载海报失败 ({title}): {str(e)}")
return None
def crawl_douban_top250(timeout=10, retry=3, delay=2):
"""爬取豆瓣电影Top250数据,包含错误处理和重试机制"""
movies = []
base_url = "https://movie.douban.com/top250?start={}&filter="
    # A pool of User-Agents to rotate through, lowering the chance of being blocked
user_agents = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"
]
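    # Rotating User-Agents lowers, but does not eliminate, the chance of Douban
    # rate-limiting the crawler; the randomized per-page delay below helps as well.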
for start in range(0, 250, 25):
url = base_url.format(start)
current_retry = 0
while current_retry < retry:
try:
                # Pick a User-Agent at random
headers = {
"User-Agent": random.choice(user_agents),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": "https://movie.douban.com/",
"Cookie": "your_cookie_here" # 建议替换为实际Cookie
}
                time.sleep(delay + random.uniform(-0.5, 0.5))  # Randomize the delay a little
response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()  # Raise on HTTP error status codes
soup = BeautifulSoup(response.text, "html.parser")
movie_items = soup.select(".grid_view li")
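                # Assumes the current Top 250 markup: an <ol class="grid_view"> whose <li>
                # children each describe one movie. An empty selection usually means an
                # anti-bot or login page came back instead of the list.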
if not movie_items:
print(f"警告:第{start//25 + 1}页未找到电影数据")
break
for item in movie_items:
                    # Movie title
title_tag = item.select_one(".title")
title = title_tag.text.strip() if title_tag else "未知标题"
                    # Rating
rating_tag = item.select_one(".rating_num")
rating = float(rating_tag.text.strip()) if (rating_tag and
rating_tag.text.strip().replace('.', '').isdigit()) else 0.0
                    # Detailed info (director, cast, year, country, genre)
bd_tag = item.select_one(".bd p")
director = "未知导演"
actors = "未知主演"
year = "未知年份"
country = "未知国家/地区"
genre = "未知类型"
if bd_tag:
bd_lines = [line.strip() for line in bd_tag.text.strip().split("\n") if line.strip()]
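                        # The .bd <p> block normally carries two text lines:
                        #   "导演: X   主演: Y" and "year / country / genres",
                        # which is the layout the parsing below relies on.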
                        # Director and cast
if len(bd_lines) >= 1:
first_line = bd_lines[0]
if "导演:" in first_line:
dir_part, rest = first_line.split("导演:", 1)
director = rest.split("主演:", 1)[0].strip() if "主演:" in rest else rest.strip()
if "主演:" in rest:
actors_part = rest.split("主演:", 1)[1]
actors = actors_part.split("...")[0].strip() if actors_part.strip() else "未知主演"
else:
                                director = first_line[:30]  # Guard against overly long text
                        # Year, country and genre
if len(bd_lines) >= 2:
second_line = bd_lines[1]
year_match = re.search(r"\d{4}", second_line)
if year_match:
year = year_match.group()
remaining = second_line[year_match.end():].strip()
                                # Handle several possible separators
separators = ["/", " ", "·"]
for sep in separators:
if sep in remaining:
parts = [p.strip() for p in remaining.split(sep) if p.strip()]
if len(parts) >= 1:
country = parts[0]
if len(parts) >= 2:
genre = parts[1]
break
if genre == "未知类型" and remaining:
genre = remaining.split()[0] if remaining.split() else remaining
                    # Number of ratings
rating_count_tag = item.select_one(".star span:last-child")
rating_count = 0
if rating_count_tag:
count_text = rating_count_tag.text.strip()
count_match = re.search(r"(\d+,)*\d+", count_text)
if count_match:
rating_count = int(re.sub(",", "", count_match.group()))
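                    # The vote text is typically of the form "1234567人评价"; the regex also
                    # tolerates thousands separators, which are stripped before int().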
                    # Rank and one-line quote
rank_tag = item.select_one(".pic em")
rank = int(rank_tag.text.strip()) if (rank_tag and rank_tag.text.strip().isdigit()) else 0
quote_tag = item.select_one(".inq")
quote = quote_tag.text.strip() if quote_tag else "无简介"
                    # Poster URL
poster_tag = item.select_one(".pic img")
poster_url = poster_tag["src"] if poster_tag and "src" in poster_tag.attrs else None
poster_path = None
if poster_url:
poster_path = download_poster(poster_url, title)
                    # Link to the movie detail page
link_tag = item.select_one(".pic a")
detail_url = link_tag["href"] if link_tag and "href" in link_tag.attrs else "未知"
                    # Keep only valid records
if title != "未知标题" and rating > 0 and year != "未知年份":
movies.append({
"电影名称": title,
"评分": rating,
"导演": director,
"主演": actors,
"上映年份": year,
"国家/地区": country,
"电影类型": genre,
"评价人数": rating_count,
"排名": rank,
"简介": quote,
"海报路径": poster_path,
"详情链接": detail_url
})
print(f"成功爬取第{start//25 + 1}页,累计{len(movies)}部电影")
break
except Timeout:
current_retry += 1
print(f"请求超时({current_retry}/{retry}),重试中...")
except RequestException as e:
current_retry += 1
print(f"请求失败({current_retry}/{retry}):{str(e)},重试中...")
            except Exception as e:
                # A parsing error aborts the whole page, so count it as a retry
                # rather than looping on the same URL forever
                current_retry += 1
                print(f"解析错误:{str(e)},重试该页({current_retry}/{retry})")
                continue
print(f"爬取完成,共获取{len(movies)}部有效电影数据")
return movies
def process_and_analyze_data(movies):
"""处理并分析电影数据,生成各种统计指标"""
if not movies:
raise ValueError("没有可分析的电影数据")
    # Build a DataFrame and clean the data
df = pd.DataFrame(movies)
    # Drop duplicate titles
initial_count = len(df)
df = df.drop_duplicates(subset=["电影名称"], keep="first")
print(f"数据去重:原始{initial_count}条,去重后{len(df)}条")
    # Type conversions
df["上映年份"] = pd.to_numeric(df["上映年份"], errors="coerce")
df["评价人数"] = pd.to_numeric(df["评价人数"], errors="coerce")
df["排名"] = pd.to_numeric(df["排名"], errors="coerce")
    # Drop invalid rows
df = df.dropna(subset=["上映年份", "评分", "电影名称"])
print(f"清理后剩余{len(df)}条有效记录")
    # Basic rating statistics
rating_stats = {
"平均评分": df["评分"].mean(),
"最高评分": df["评分"].max(),
"最低评分": df["评分"].min(),
"评分中位数": df["评分"].median()
}
    # Group by decade
df["年代"] = (df["上映年份"] // 10 * 10).astype(int)
decade_stats = df.groupby("年代")["评分"].agg(["mean", "count"]).sort_index()
    # Years with high-rated (>= 9.0) movies
high_rating_years = df[df["评分"] >= 9.0]["上映年份"].value_counts().sort_index()
if high_rating_years.empty:
high_rating_years = pd.Series({pd.Timestamp.now().year: 0}, name="上映年份")
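    # The placeholder Series keeps the later bar chart from failing when no film
    # in the cleaned data reaches a 9.0 rating.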
    # Genre analysis
genre_stats = defaultdict(lambda: {"count": 0, "total_rating": 0})
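    # A film's genre field may hold several space-separated genres (e.g. "犯罪 剧情"),
    # so each row is split and every genre accumulates its own count and rating sum.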
for _, row in df.iterrows():
genres = str(row["电影类型"]).split()
for genre in genres:
if genre and genre != "未知类型":
genre_stats[genre]["count"] += 1
genre_stats[genre]["total_rating"] += row["评分"]
    # Average rating per genre
genre_count = {g: stats["count"] for g, stats in genre_stats.items()}
genre_rating = {g: stats["total_rating"]/stats["count"] for g, stats in genre_stats.items() if stats["count"] > 0}
    # Country/region analysis
country_details = defaultdict(int)
for countries in df["国家/地区"]:
for c in str(countries).split("/"):
if c.strip():
country_details[c.strip()] += 1
country_details = dict(sorted(country_details.items(), key=lambda x: x[1], reverse=True))
    # Director analysis
director_stats = defaultdict(int)
for director in df["导演"]:
if director and director != "未知导演":
            # Handle entries that list multiple directors
for d in director.split("/"):
d = d.strip()
if d:
director_stats[d] += 1
top_directors = dict(sorted(director_stats.items(), key=lambda x: x[1], reverse=True)[:10])
    # Correlation between rating and number of ratings
valid_corr_data = df[(df["评价人数"] > 0) & (df["评分"] > 0)]
correlation = valid_corr_data["评分"].corr(valid_corr_data["评价人数"]) if len(valid_corr_data) >= 2 else 0.0
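    # Series.corr() defaults to the Pearson coefficient and needs at least
    # two valid rows, hence the length guard above.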
    # Top-10 lists
top10_rating = df.sort_values(by=["评分", "排名"], ascending=[False, True]).head(10)
top10_rating_count = df.sort_values(by=["评价人数", "评分"], ascending=[False, False]).head(10)
return (
df, rating_stats, decade_stats, high_rating_years,
genre_count, genre_rating, country_details, top_directors,
correlation, top10_rating, top10_rating_count
)
def visualize_data(df, rating_stats, decade_stats, genre_count, genre_rating,
country_details, top_directors, top10_rating, high_rating_years, correlation):
"""可视化分析结果,确保所有中文正常显示"""
# 使用全局设置的字体
global current_font
plot_font = {'fontproperties': fm.FontProperties(family=current_font)}
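    # Every title/label below also receives an explicit FontProperties object, so the
    # Chinese text still renders even if a later call resets the font rcParams.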
    # 1. Rating distribution histogram
plt.figure(figsize=(10, 6))
sns.histplot(df["评分"], bins=15, kde=True, color="#3498db")
plt.axvline(rating_stats["平均评分"], color='r', linestyle='dashed', linewidth=2,
label=f'平均分: {rating_stats["平均评分"]:.2f}')
plt.title("豆瓣Top250电影评分分布", fontsize=14, pad=20, **plot_font)
plt.xlabel("评分", fontsize=12,** plot_font)
plt.ylabel("电影数量", fontsize=12, **plot_font)
plt.legend(prop={'family': current_font})
plt.tight_layout()
plt.savefig("rating_distribution.png", dpi=300)
plt.close()
print("评分分布图已保存为 rating_distribution.png")
    # 2. Average rating per decade
plt.figure(figsize=(12, 7))
sns.lineplot(x=decade_stats.index, y="mean", data=decade_stats, marker='o', color="#e74c3c")
plt.title("各年代豆瓣Top250电影平均评分趋势", fontsize=14, pad=20,** plot_font)
plt.xlabel("年代", fontsize=12, **plot_font)
plt.ylabel("平均评分", fontsize=12,** plot_font)
plt.ylim(8.0, 9.5)
for x, y in zip(decade_stats.index, decade_stats["count"]):
plt.text(x, 8.05, f"n={y}", ha="center", fontsize=9, **plot_font)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("decade_rating_trend.png", dpi=300)
plt.close()
print("年代评分趋势图已保存为 decade_rating_trend.png")
    # 3. Number of movies per genre
plt.figure(figsize=(12, 8))
top_genres = dict(sorted(genre_count.items(), key=lambda x: x[1], reverse=True)[:15])
sns.barplot(x=list(top_genres.values()), y=list(top_genres.keys()), palette="viridis")
plt.title("电影类型数量分布(前15)", fontsize=14, pad=20,** plot_font)
plt.xlabel("电影数量", fontsize=12, **plot_font)
plt.ylabel("类型", fontsize=12,** plot_font)
for i, v in enumerate(top_genres.values()):
plt.text(v + 0.5, i, str(v), va="center", **plot_font)
plt.tight_layout()
plt.savefig("genre_count.png", dpi=300)
plt.close()
print("类型数量分布图已保存为 genre_count.png")
    # 4. Average rating per genre
plt.figure(figsize=(12, 8))
valid_genres = {g: r for g, r in genre_rating.items() if genre_count.get(g, 0) >= 5}
valid_genres = dict(sorted(valid_genres.items(), key=lambda x: x[1], reverse=True))
sns.barplot(x=list(valid_genres.values()), y=list(valid_genres.keys()), palette="plasma")
plt.title("电影类型平均评分(样本数≥5)", fontsize=14, pad=20,** plot_font)
plt.xlabel("平均评分", fontsize=12, **plot_font)
plt.ylabel("类型", fontsize=12,** plot_font)
plt.xlim(8.0, 9.5)
for i, v in enumerate(valid_genres.values()):
plt.text(v + 0.02, i, f"{v:.2f}", va="center", **plot_font)
plt.tight_layout()
plt.savefig("genre_rating.png", dpi=300)
plt.close()
print("类型评分图已保存为 genre_rating.png")
    # 5. Country/region distribution
plt.figure(figsize=(12, 8))
top_countries = dict(list(country_details.items())[:10])
sns.barplot(x=list(top_countries.values()), y=list(top_countries.keys()), palette="magma")
plt.title("电影国家/地区分布(前10)", fontsize=14, pad=20,** plot_font)
plt.xlabel("电影数量", fontsize=12, **plot_font)
plt.ylabel("国家/地区", fontsize=12,** plot_font)
for i, v in enumerate(top_countries.values()):
plt.text(v + 0.5, i, str(v), va="center", **plot_font)
plt.tight_layout()
plt.savefig("country_distribution.png", dpi=300)
plt.close()
print("国家/地区分布图已保存为 country_distribution.png")
    # 6. Top 10 movies by rating
plt.figure(figsize=(14, 7))
top10_sorted = top10_rating.sort_values("评分", ascending=True)
sns.barplot(x="评分", y="电影名称", data=top10_sorted, palette="rocket")
plt.title("评分TOP10电影", fontsize=14, pad=20,** plot_font)
plt.xlabel("评分", fontsize=12, **plot_font)
plt.ylabel("电影名称", fontsize=12,** plot_font)
plt.xlim(8.9, 9.7)
for i, v in enumerate(top10_sorted["评分"]):
plt.text(v + 0.01, i, f"{v:.1f}", va="center", **plot_font)
plt.tight_layout()
plt.savefig("top10_rating.png", dpi=300)
plt.close()
print("评分TOP10图已保存为 top10_rating.png")
    # 7. Years of high-rated movies
plt.figure(figsize=(14, 7))
valid_years = high_rating_years[high_rating_years.index >= 1930]
if valid_years.empty:
valid_years = pd.Series({2000: 0}, name="上映年份")
sns.barplot(x=valid_years.index.astype(str), y=valid_years.values, palette="cubehelix")
plt.title("高分电影(≥9.0)年份分布", fontsize=14, pad=20, **plot_font)
plt.xlabel("年份", fontsize=12,** plot_font)
plt.ylabel("电影数量", fontsize=12, **plot_font)
plt.xticks(rotation=45, ha="right", fontproperties=fm.FontProperties(family=current_font))
for i, v in enumerate(valid_years.values):
if v > 0:
            plt.text(i, v + 0.1, str(v), ha="center", **plot_font)
plt.tight_layout()
plt.savefig("high_rating_years.png", dpi=300)
plt.close()
print("高分电影年份图已保存为 high_rating_years.png")
    # 8. Rating vs. number of ratings
plt.figure(figsize=(12, 8))
valid_data = df[(df["评价人数"] > 0) & (df["评分"] > 0)]
sns.scatterplot(x="评价人数", y="评分", data=valid_data, alpha=0.6, s=50)
sns.regplot(x="评价人数", y="评分", data=valid_data, scatter=False, color="red")
plt.title(f"评分与评价人数相关性(r={correlation:.3f})", fontsize=14, pad=20, **plot_font)
plt.xlabel("评价人数", fontsize=12,** plot_font)
plt.ylabel("评分", fontsize=12, **plot_font)
plt.xscale("log") # 使用对数刻度更清晰展示分布
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig("rating_vs_count.png", dpi=300)
plt.close()
print("评分与评价人数相关性图已保存为 rating_vs_count.png")
    # 9. Most frequent directors
if top_directors:
plt.figure(figsize=(12, 7))
sns.barplot(x=list(top_directors.values()), y=list(top_directors.keys()), palette="mako")
plt.title("上榜次数最多的导演(前10)", fontsize=14, pad=20,** plot_font)
plt.xlabel("上榜电影数量", fontsize=12, **plot_font)
plt.ylabel("导演", fontsize=12,** plot_font)
for i, v in enumerate(top_directors.values()):
plt.text(v + 0.1, i, str(v), va="center", **plot_font)
plt.tight_layout()
plt.savefig("top_directors.png", dpi=300)
plt.close()
print("热门导演分析图已保存为 top_directors.png")
def main():
"""主函数:协调爬取、分析和可视化过程"""
print("=" * 60)
print(" 豆瓣电影Top250数据爬取与分析工具 ")
print("=" * 60)
try:
        # 1. Crawl the data
print("\n【1/4】开始爬取豆瓣Top250数据...")
movies = crawl_douban_top250(timeout=15, retry=3)
if len(movies) < 100:
print("\n警告:爬取数据不足100部,可能受反爬限制")
if input("是否继续分析?(y/n):").strip().lower() != "y":
print("程序终止")
return
        # 2. Process and analyze the data
print("\n【2/4】开始处理与分析数据...")
analysis_results = process_and_analyze_data(movies)
(
df, rating_stats, decade_stats, high_rating_years,
genre_count, genre_rating, country_details, top_directors,
correlation, top10_rating, top10_rating_count
) = analysis_results
        # 3. Save the data
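        # utf-8-sig writes a BOM so that Excel detects the encoding and the Chinese
        # column names open without mojibake.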
df.to_csv("douban_top250.csv", index=False, encoding="utf-8-sig")
print(f"\n【3/4】数据已保存至 douban_top250.csv({len(df)}条记录)")
        # 4. Visualization
print("\n【4/4】生成可视化图表...")
visualize_data(df, rating_stats, decade_stats, genre_count, genre_rating,
country_details, top_directors, top10_rating, high_rating_years, correlation)
        # Print the key results
print("\n" + "=" * 60)
print(" 核心分析结果 ")
print("=" * 60)
print("1. 评分概况:")
for key, value in rating_stats.items():
print(f" - {key}:{value:.2f}")
print(f"\n2. 高分电影(≥9.0)最多的年份:")
if high_rating_years.max() > 0:
top_year = high_rating_years.idxmax()
print(f" - {top_year}年({high_rating_years.max()}部)")
print(f"\n3. 主要类型分析:")
if genre_count:
top_genre = max(genre_count.items(), key=lambda x: x[1])
top_rating_genre = max(genre_rating.items(), key=lambda x: x[1])
print(f" - 数量最多:{top_genre[0]}({top_genre[1]}部)")
print(f" - 评分最高:{top_rating_genre[0]}({top_rating_genre[1]:.2f}分)")
print(f"\n4. 国家/地区分析:")
if country_details:
top_country = next(iter(country_details.items()))
print(f" - 上榜最多:{top_country[0]}({top_country[1]}部)")
print(f"\n5. 导演分析:")
if top_directors:
top_dir = next(iter(top_directors.items()))
print(f" - 上榜最多:{top_dir[0]}({top_dir[1]}部)")
print(f"\n6. 相关性:评分与评价人数相关系数 {correlation:.3f}")
print(f"\n7. 评分前三电影:")
for i, (_, row) in enumerate(top10_rating.head(3).iterrows(), 1):
print(f" {i}. {row['电影名称']}({row['上映年份']}年,{row['评分']:.1f}分)")
print(f"\n8. 最受欢迎前三电影(评价人数):")
for i, (_, row) in enumerate(top10_rating_count.head(3).iterrows(), 1):
print(f" {i}. {row['电影名称']}({row['评价人数']:,}人评价)")
print("=" * 60)
print("\n分析完成!所有结果已保存到当前目录")
print(f"电影海报已保存到 {os.path.abspath('posters')} 文件夹")
except Exception as e:
print(f"\n程序出错:{str(e)}")
print("建议检查网络连接或更新Cookie后重试")
if __name__ == "__main__":
main()
Fix the garbled movie-title (乱码) problem in the code above and produce a new, complete version of the code.