# 以下代码唯独电影类型没有爬取到,导致词云图无法生成,改写代码:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import time
import random
import re
# Placeholder stored when a field cannot be extracted from the page.
_UNKNOWN = "未知"

# The credits line looks like "导演: A / B   主演: C / D /...".  Long lines appear
# to be truncated by the site, sometimes in the middle of the "主演" label
# ("主..."), so the director capture stops at whitespace followed by "主".
_DIRECTOR_RE = re.compile(r"导演\s*[::]\s*(.*?)(?:\s+主|$)")
_ACTORS_RE = re.compile(r"主演\s*[::]\s*(.*)$")


def _strip_ellipsis(name):
    """Trim whitespace and a trailing "..." truncation marker from a name."""
    name = name.strip()
    if name.endswith("..."):
        name = name[:-3].rstrip()
    return name


def _parse_info_lines(lines):
    """Extract (director, actors, genres) from the lines of the info paragraph.

    Args:
        lines: Stripped text lines of the ``div.bd p`` element.  The first
            line is expected to hold the credits ("导演: ... 主演: ...") and
            the second one "year / country / genre genre ...".

    Returns:
        A tuple ``(director, actors, genres)`` where ``director`` is the first
        listed director, ``actors`` is a space-joined string of all listed
        actors and ``genres`` is a list of genre names.  Any field that cannot
        be found falls back to "未知" (``["未知"]`` for genres).
    """
    director = _UNKNOWN
    actors = _UNKNOWN
    genres = [_UNKNOWN]

    if lines:
        credits = lines[0]

        director_match = _DIRECTOR_RE.search(credits)
        if director_match:
            # Several directors are separated by "/"; keep only the first.
            first_director = _strip_ellipsis(director_match.group(1).split("/")[0])
            if first_director:
                director = first_director

        actor_match = _ACTORS_RE.search(credits)
        if actor_match:
            names = [_strip_ellipsis(part) for part in actor_match.group(1).split("/")]
            names = [name for name in names if name]
            if names:
                actors = " ".join(names)

    if len(lines) > 1 and "/" in lines[1]:
        # The genre names are the last "/"-separated field.  Taking the last
        # field keeps this correct when several years or countries are listed.
        # str.split() without arguments also splits on non-breaking spaces.
        parsed = lines[1].rsplit("/", 1)[-1].split()
        if parsed:
            genres = parsed

    return director, actors, genres


def _parse_movie_item(item):
    """Turn one ``div.item`` element into a movie dict, or None if it is unusable."""
    title_tag = item.select_one("span.title")
    rating_tag = item.select_one("span.rating_num")
    if title_tag is None or rating_tag is None:
        return None
    try:
        rating = float(rating_tag.get_text(strip=True))
    except ValueError:
        return None

    info_tag = item.select_one("div.bd p")
    # stripped_strings yields one string per text node, so the credits line and
    # the year/country/genre line stay separate instead of being fused.
    lines = list(info_tag.stripped_strings) if info_tag is not None else []
    director, actors, genres = _parse_info_lines(lines)

    return {
        "title": title_tag.get_text(strip=True),
        "rating": rating,
        "director": director,
        "actors": actors,
        "genres": genres,
    }


def get_movie_data():
    """Scrape the Douban Top250 list pages into a DataFrame.

    Ten pages of 25 entries each are requested.  Pages that fail to download
    are reported on stdout and skipped, so the result may contain fewer than
    250 rows (or none at all).

    The genre is read from the info paragraph of each list entry: the
    ``span[property='v:genre']`` selector used previously does not match
    anything on the list page, which left every movie with the "未知" genre.

    Returns:
        pandas.DataFrame with the columns ``title``, ``rating`` (float),
        ``director``, ``actors`` (space-joined string) and ``genres`` (list of
        str).
    """
    base_url = "https://movie.douban.com/top250"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Referer": "https://www.douban.com/",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    movies = []
    for start in range(0, 250, 25):
        url = f"{base_url}?start={start}&filter=&type="
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"请求异常: {e}")
        else:
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                for item in soup.select("div.item"):
                    movie = _parse_movie_item(item)
                    # Skip malformed entries instead of losing the whole page.
                    if movie is not None:
                        movies.append(movie)
                print(f"第{(start // 25) + 1}页爬取完成")
            else:
                print(f"请求失败,状态码: {response.status_code}")
        # Pause 0.5-1 s between requests, including after a failed one.
        time.sleep(random.uniform(0.5, 1))

    df = pd.DataFrame(movies, columns=["title", "rating", "director", "actors", "genres"])
    print("原始数据预览:")
    print(df.head())
    return df
def generate_wordcloud(text, filename, font_path="simhei.ttf"):
    """Render a word cloud for ``text``, save it to ``filename`` and show it.

    Args:
        text: Whitespace-separated words to visualise.  Empty, blank or
            non-string input is reported on stdout and skipped.
        filename: Path of the image file to write.
        font_path: Font file handed to WordCloud; it must be a font that
            contains CJK glyphs for Chinese text to render.  Defaults to
            "simhei.ttf", the value that was previously hard-coded.

    Returns:
        None.  Failures while building or saving the image are printed rather
        than raised, so one failed cloud does not stop the remaining ones.
    """
    if not isinstance(text, str) or not text.strip():
        print(f"警告:{filename} 的文本为空,跳过生成词云")
        return

    try:
        wordcloud = WordCloud(
            font_path=font_path,
            width=800,
            height=400,
            background_color="white",
            max_words=200,
        ).generate(text)

        fig = plt.figure(figsize=(10, 5))
        try:
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.tight_layout()
            # Save before show(): the figure may be gone once the window closes.
            plt.savefig(filename)
            plt.show()
        finally:
            # Release the figure so repeated calls do not accumulate figures.
            plt.close(fig)
    except Exception as e:
        # Deliberately broad: this is a best-effort rendering step.
        print(f"生成 {filename} 词云时出错: {e}")
if __name__ == "__main__":
    # Script entry point: scrape, export the ranking, then draw three word clouds.
    df = get_movie_data()

    # Step 1: rank by rating (highest first) and export title + rating to Excel.
    df_sorted = df.sort_values(by="rating", ascending=False)
    ranking = df_sorted[["title", "rating"]]
    ranking.to_excel("豆瓣电影Top250_评分排序_1.xlsx", index=False)
    print("评分排序的Excel文件已保存。")

    # Step 2: build one whitespace-separated text per word cloud.
    all_titles = " ".join(df_sorted["title"])
    all_actors = " ".join(df_sorted["actors"].astype(str))

    # Each row holds a list of genres; rows that are not lists are ignored.
    genre_chunks = []
    for entry in df["genres"]:
        if isinstance(entry, list):
            genre_chunks.append(" ".join(entry))
    all_genres = " ".join(genre_chunks)

    # Debug output: the first 200 characters of each text.
    print("电影名称文本示例:", all_titles[:200])
    print("主演文本示例:", all_actors[:200])
    print("类型文本示例:", all_genres[:200])

    # Step 3: render the word clouds.
    generate_wordcloud(all_titles, "电影名称词云.png")
    generate_wordcloud(all_actors, "主演词云.png")
    generate_wordcloud(all_genres, "电影类型词云.png")
# 最新发布