# -*- coding: utf-8 -*-
import sys
import random
import time
import re
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
from PyQt6.QtWidgets import (
QApplication, QMainWindow, QWidget, QVBoxLayout, QHBoxLayout,
QPushButton, QLabel, QProgressBar, QTableWidget, QTableWidgetItem,
QTextEdit, QFileDialog, QHeaderView, QTabWidget, QMessageBox,
QComboBox, QScrollArea
)
from PyQt6.QtCore import Qt, QThread, pyqtSignal
from PyQt6.QtGui import QFont, QColor
import matplotlib
matplotlib.use('Agg') # 非GUI后端
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS']
plt.rcParams['axes.unicode_minus'] = False
import numpy as np
from wordcloud import WordCloud
import jieba
jieba.initialize()
from collections import Counter
import os
import platform
# ==================== 字体路径检测 ====================
def get_font_path():
    """Return the hard-coded font file path used for the current OS.

    The choice is driven only by ``platform.system()``; this function does
    not check that the file exists.
    """
    known_systems = {
        "Windows": r"C:\Windows\Fonts\simhei.ttf",
        "Darwin": "/System/Library/Fonts/PingFang.ttc",  # macOS
    }
    # Every other system name falls through to the WenQuanYi path.
    return known_systems.get(
        platform.system(),
        "/usr/share/fonts/truetype/wqy/wenquanyi-microhei.ttc",
    )
# Resolve the font path once at import time and report on stdout whether the
# file is actually present on this machine.
FONT_PATH = get_font_path()
print(
    f"✅ 使用字体: {FONT_PATH}"
    if os.path.exists(FONT_PATH)
    else f"⚠️ 字体未找到: {FONT_PATH}"
)
# ==================== 配色 ====================
# Application palette: maps a role name to a QColor built from its RGB triple.
C = {
    role: QColor(*channels)
    for role, channels in (
        ("bg", (255, 255, 255)),
        ("primary", (255, 179, 186)),
        ("secondary", (186, 220, 255)),
        ("text", (51, 51, 51)),
    )
}
def rgb(c):
    """Format a colour object as a lowercase ``#rrggbb`` hex string.

    ``c`` must provide ``red()``, ``green()`` and ``blue()`` methods that
    return integers; each channel is rendered as two hex digits.
    """
    channels = (c.red(), c.green(), c.blue())
    return "#" + "".join(f"{value:02x}" for value in channels)
# ==================== 按钮样式 ====================
def create_button(text):
    """Create a QPushButton labelled ``text`` with the shared button style.

    The stylesheet draws a horizontal gradient from the palette's primary
    colour to its secondary colour, switches to the plain secondary colour
    on hover, and greys the button out when it is disabled.
    """
    styled_button = QPushButton(text)
    # Resolve the palette entries once so the template below stays readable.
    primary_hex = rgb(C['primary'])
    secondary_hex = rgb(C['secondary'])
    text_hex = rgb(C['text'])
    styled_button.setStyleSheet(f"""
QPushButton {{
background: qlineargradient(x1:0,y1:0,x2:1,y2:0,
stop:0 {primary_hex}, stop:1 {secondary_hex});
color: {text_hex};
font-size: 14px;
font-weight: bold;
border: none;
border-radius: 8px;
padding: 8px 16px;
}}
QPushButton:hover {{
background: {secondary_hex};
}}
QPushButton:disabled {{
background: #eeeeee;
color: #999999;
}}
""")
    return styled_button
# ==================== 爬虫线程 ====================
class CrawlThread(QThread):
    """Background thread that scrapes the Douban Top 250 list page by page.

    Signals:
        progress_signal(int): percentage (0-100) of the 250 expected
            entries collected so far.
        log_signal(str): one status line per page, failure or skipped batch.
        data_signal(list): the final list of movie dicts, emitted once.
        finish_signal(): emitted right after ``data_signal``.

    Setting ``is_running`` to ``False`` from another thread makes ``run``
    stop before it requests the next page.
    """

    progress_signal = pyqtSignal(int)
    log_signal = pyqtSignal(str)
    data_signal = pyqtSignal(list)
    finish_signal = pyqtSignal()

    LIST_URL = "https://movie.douban.com/top250"
    TOTAL_MOVIES = 250
    PAGE_SIZE = 25
    USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0)",
    )

    # Compiled once instead of on every parsed entry.
    _YEAR_RE = re.compile(r'(19\d{2}|20[012]\d)')
    _TRAILING_YEAR_RE = re.compile(r'\d{4}\s*$')
    _LEADING_YEAR_RE = re.compile(r'^\d{4}')
    _COUNTRY_START_RE = re.compile(r'^[a-zA-Z\u4e00-\u9fa5]')

    def __init__(self):
        super().__init__()
        self.is_running = True

    @classmethod
    def _extract_country(cls, info_text):
        """Return the first country-like token of the info text, or "未知".

        The candidate line is the last non-empty line that contains a ``/``
        and does not end with four digits. Its ``/``-separated parts are
        scanned in order; a leading four-digit year is stripped from each.
        """
        lines = [line.strip() for line in info_text.split('\n') if line.strip()]
        country_line = next(
            (
                line for line in reversed(lines)
                if '/' in line and not cls._TRAILING_YEAR_RE.search(line)
            ),
            "",
        )
        for part in country_line.split('/'):
            candidate = cls._LEADING_YEAR_RE.sub('', part.strip()).strip()
            if (
                len(candidate) > 1
                and not candidate.isdigit()
                and cls._COUNTRY_START_RE.match(candidate)
            ):
                return candidate
        return "未知"

    @classmethod
    def _parse_item(cls, item):
        """Turn one ``div.item`` element into a movie dict.

        Raises whatever the lookups raise when the expected markup is
        missing (e.g. ``AttributeError`` when ``div.bd`` or its ``<p>`` is
        absent, ``ValueError`` when the rating is not a number); the caller
        decides how to handle a malformed entry.
        """
        title_tag = item.find('span', class_='title')
        rating_tag = item.find('span', class_='rating_num')
        info_text = item.find('div', class_='bd').find('p').get_text('\n', strip=True)
        quote_tag = item.find('span', class_='inq')

        name = title_tag.text.strip() if title_tag else "未知电影"
        rating = float(rating_tag.text.strip()) if rating_tag else 0.0
        year_match = cls._YEAR_RE.search(info_text)
        summary = quote_tag.text.strip() if quote_tag else ""
        return {
            "name": name,
            "rating": rating,
            "year": int(year_match.group(1)) if year_match else 0,
            "country": cls._extract_country(info_text),
            # The first line of the info paragraph holds director and cast.
            "director_actor": info_text.split('\n')[0].strip(),
            "summary": summary,
            "full_text": f"{name} {summary}",
        }

    def run(self):
        """Fetch every list page, parse it, then emit the sorted result.

        Failed pages and unparsable entries are logged and skipped. The
        result is padded with placeholder rows up to 250 entries and sorted
        by rating (descending), then by year (ascending, unknown last).
        """
        movies = []
        for page in range(self.TOTAL_MOVIES // self.PAGE_SIZE):
            if not self.is_running:
                break
            try:
                # Random pause between requests.
                time.sleep(random.uniform(0.5, 1.0))
                resp = requests.get(
                    self.LIST_URL,
                    headers={"User-Agent": random.choice(self.USER_AGENTS)},
                    params={"start": page * self.PAGE_SIZE},
                    timeout=10,
                )
                resp.encoding = 'utf-8'
                if resp.status_code != 200:
                    self.log_signal.emit(f"第{page+1}页失败: {resp.status_code}")
                    continue

                soup = BeautifulSoup(resp.text, 'html.parser')
                skipped = 0
                for item in soup.find_all('div', class_='item'):
                    try:
                        movies.append(self._parse_item(item))
                    except Exception:
                        # Best effort: one malformed entry must not abort
                        # the page, but it is counted and reported below
                        # instead of vanishing silently.
                        skipped += 1

                current_count = len(movies)
                self.progress_signal.emit(
                    min(int(current_count / self.TOTAL_MOVIES * 100), 100)
                )
                self.log_signal.emit(f"第{page+1}页完成 → 抓取 {current_count}/250")
                if skipped:
                    self.log_signal.emit(f"第{page+1}页有 {skipped} 条解析失败,已跳过")
            except Exception as e:
                # Boundary of the worker thread: log and move on to the
                # next page rather than letting the exception escape run().
                self.log_signal.emit(f"网络错误: {str(e)[:30]}")

        # Pad the result to 250 rows with placeholder entries.
        while len(movies) < self.TOTAL_MOVIES:
            movies.append({
                "name": "数据缺失",
                "rating": 0.0,
                "year": 0,
                "country": "未知",
                "director_actor": "",
                "summary": "",
                "full_text": "",
            })

        # Highest rating first; ties ordered by year, unknown years last.
        movies.sort(key=lambda x: (-x["rating"], x["year"] if x["year"] > 0 else 9999))
        self.data_signal.emit(movies)
        self.finish_signal.emit()
# ==================== 可视化图表生成器(增强版,多图+防重叠)====================
from matplotlib.backends.backend_qtagg import FigureCanvasQTAgg as FigureCanvas
def generate_rating_line_chart(movies):
    """Build a line chart of how many movies fall on each 0.1 rating step.

    Only ratings in ``[8.0, 10.0]`` are counted. Returns a ``FigureCanvas``
    wrapping the figure.

    The figure is created with ``matplotlib.figure.Figure`` rather than
    ``plt.subplots`` so it is not registered with pyplot's global figure
    manager; pyplot-managed figures stay alive until ``plt.close`` is
    called, which leaked one figure per refresh.
    """
    from matplotlib.figure import Figure

    # Count on integer tenths: float bin edges from np.arange can mis-bin
    # values that sit exactly on an edge, and the old left-edge plotting
    # drew a 10.0 rating at x = 9.9.
    tenths = range(80, 101)  # 8.0 .. 10.0 inclusive
    tally = Counter(
        round(m["rating"] * 10) for m in movies if 8.0 <= m["rating"] <= 10.0
    )
    xs = [t / 10 for t in tenths]
    ys = [tally.get(t, 0) for t in tenths]

    fig = Figure(figsize=(10, 4), dpi=100)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.plot(xs, ys, marker='o', linestyle='-', color=C["primary"].name(),
            linewidth=2, markersize=4)
    ax.set_title("📈 评分分布趋势图(8.0~10.0,步长0.1)", fontsize=14, fontweight='bold', pad=10)
    ax.set_xlabel("评分", fontsize=12)
    ax.set_ylabel("数量", fontsize=12)
    ax.grid(True, alpha=0.3)
    ax.set_xticks(xs[::2])  # a tick every 0.2
    ax.tick_params(axis='x', labelrotation=0)
    fig.tight_layout(pad=1.0)
    return canvas
def generate_wordcloud(movies):
    """Build a word cloud from the ``full_text`` field of ``movies``.

    The text is segmented with jieba; tokens must be longer than one
    character, consist only of CJK/ASCII letters and not be in the stop
    list. The 100 most frequent tokens are drawn. When nothing survives the
    filter a single placeholder word is used so the cloud can still render.
    Returns a ``FigureCanvas`` wrapping the figure.

    The figure is created with ``matplotlib.figure.Figure`` instead of
    ``plt.subplots`` so pyplot does not keep a reference to it after the
    canvas widget is deleted.
    """
    from matplotlib.figure import Figure

    text = " ".join(m["full_text"] for m in movies if m["full_text"].strip())
    stopwords = {"电影", "影片", "故事", "这部", "一个", "没有", "我们", "自己", "导演", "主演", "一部"}
    token_re = re.compile(r'^[\u4e00-\u9fa5a-zA-Z]+$')
    words = [
        w for w in jieba.lcut(text)
        if len(w) > 1 and w not in stopwords and token_re.match(w)
    ]
    freq = Counter(words).most_common(100)
    if not freq:
        freq = [("经典", 10)]

    # Passing a non-existent font file makes WordCloud fail when it opens
    # the font; fall back to its bundled default (None) in that case.
    # NOTE(review): the default font presumably lacks CJK glyphs, so the
    # words may render as boxes, but the chart tab no longer errors out.
    font_path = FONT_PATH if os.path.exists(FONT_PATH) else None
    wc = WordCloud(
        font_path=font_path,
        width=800,
        height=400,
        background_color="white",
        max_words=100,
    ).generate_from_frequencies(dict(freq))

    fig = Figure(figsize=(10, 4), dpi=100)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.imshow(wc, interpolation='bilinear')
    ax.axis("off")
    ax.set_title("☁️ 高频词汇词云", fontsize=14, pad=10)
    fig.tight_layout(pad=1.0)
    return canvas
def generate_year_histogram(movies):
    """Build a histogram of release years in 5-year bins.

    Movies whose ``year`` is 0 (unknown) are ignored. The bins always cover
    at least 1950-2025 and are widened in 5-year steps to include the
    earliest and latest year present, so entries outside the default window
    are no longer silently dropped. Returns a ``FigureCanvas`` wrapping the
    figure.

    The figure is created with ``matplotlib.figure.Figure`` instead of
    ``plt.subplots`` so pyplot does not keep a reference to it after the
    canvas widget is deleted.
    """
    from matplotlib.figure import Figure

    years = [m["year"] for m in movies if m["year"] > 0]
    first_edge, last_edge = 1950, 2025
    if years:
        first_edge = min(first_edge, min(years) // 5 * 5)
        last_edge = max(last_edge, max(years) // 5 * 5 + 5)
    edges = range(first_edge, last_edge + 1, 5)

    fig = Figure(figsize=(10, 4), dpi=100)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.hist(years, bins=edges, color=C["secondary"].name(), edgecolor='black', alpha=0.7)
    ax.set_title("📅 电影年份分布直方图(每5年一档)", fontsize=14, fontweight='bold', pad=10)
    ax.set_xlabel("年份", fontsize=12)
    ax.set_ylabel("数量", fontsize=12)
    ax.grid(True, axis='y', alpha=0.3)
    fig.tight_layout(pad=1.0)
    return canvas
def generate_country_pie_chart(movies):
    """Build a pie chart of the eight most common countries/regions.

    Entries whose country is "未知" are excluded; everything beyond the top
    eight is merged into a single "其他" slice. When no country is known a
    "暂无数据" notice is drawn instead of a pie. Returns a ``FigureCanvas``
    wrapping the figure.

    The figure is created with ``matplotlib.figure.Figure`` instead of
    ``plt.subplots`` so pyplot does not keep a reference to it after the
    canvas widget is deleted.
    """
    from matplotlib.figure import Figure

    tally = Counter(m["country"] for m in movies if m["country"] != "未知")
    top = tally.most_common(8)
    others = sum(tally.values()) - sum(count for _, count in top)
    labels = [country for country, _ in top]
    sizes = [count for _, count in top]
    if others > 0:
        labels.append("其他")
        sizes.append(others)

    fig = Figure(figsize=(10, 4), dpi=100)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    if sizes:
        ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
               colors=plt.cm.Pastel1.colors)
    else:
        # Nothing to plot: avoid handing an empty sequence to pie().
        ax.text(0.5, 0.5, "暂无数据", ha='center', va='center', fontsize=12)
        ax.axis("off")
    ax.set_title("🌍 主要出品国家/地区占比", fontsize=14, pad=10)
    fig.tight_layout(pad=1.0)
    return canvas
def generate_rating_boxplot(movies):
    """Build a horizontal box plot of all positive ratings.

    Entries with a rating of 0 are left out. Returns a ``FigureCanvas``
    wrapping the figure.

    The figure is created with ``matplotlib.figure.Figure`` instead of
    ``plt.subplots`` so pyplot does not keep a reference to it after the
    canvas widget is deleted.
    """
    from matplotlib.figure import Figure

    ratings = [m["rating"] for m in movies if m["rating"] > 0]

    fig = Figure(figsize=(10, 4), dpi=100)
    canvas = FigureCanvas(fig)
    ax = fig.add_subplot(111)
    ax.boxplot(
        ratings,
        vert=False,
        patch_artist=True,  # required for the box face colour to apply
        boxprops=dict(facecolor=C["primary"].name()),
        medianprops=dict(color="red"),
    )
    ax.set_title("📦 评分箱型图(中位数、异常值等)", fontsize=14, fontweight='bold', pad=10)
    ax.set_xlabel("评分")
    fig.tight_layout(pad=1.0)
    return canvas
# ==================== 主窗口 ====================
class MainWindow(QMainWindow):
    """Main window: crawl controls, filters, and data / log / chart tabs.

    ``movies_data`` holds everything the crawl thread delivered;
    ``filtered_data`` is the subset currently shown in the table, the charts
    and the Excel export.
    """

    # Minimum pixel height of each chart canvas. Without it the canvases
    # can be compressed to fit the viewport, so the scroll area never
    # scrolls and the five charts are squeezed together.
    CHART_MIN_HEIGHT = 400

    def __init__(self):
        super().__init__()
        self.setWindowTitle("豆瓣Top250 - 评分趋势分析")
        self.resize(1000, 700)
        self.movies_data = []
        self.filtered_data = []
        self.crawl_thread = None
        self.init_ui()

    def init_ui(self):
        """Create all widgets, lay them out and connect the button signals."""
        container = QWidget()
        self.setCentralWidget(container)
        layout = QVBoxLayout(container)

        title = QLabel("📊 豆瓣Top250 评分趋势分析")
        title.setAlignment(Qt.AlignmentFlag.AlignCenter)
        title.setStyleSheet(f"color: {rgb(C['primary'])}; font-size: 24px; font-weight: bold;")
        layout.addWidget(title)

        # Control buttons; everything except "start" is disabled until a
        # crawl is running or has finished.
        ctrl_layout = QHBoxLayout()
        self.start_btn = create_button("开始爬取")
        self.stop_btn = create_button("停止")
        self.export_btn = create_button("导出Excel")
        self.filter_btn = create_button("应用筛选")
        self.reset_btn = create_button("重置")
        for button in (self.stop_btn, self.export_btn, self.filter_btn, self.reset_btn):
            button.setEnabled(False)
        for button in (self.start_btn, self.stop_btn, self.export_btn,
                       self.filter_btn, self.reset_btn):
            ctrl_layout.addWidget(button)
        layout.addLayout(ctrl_layout)

        self.progress_bar = QProgressBar()
        layout.addWidget(self.progress_bar)

        # Filter inputs. Rating options are generated from integer tenths so
        # the list does not depend on float accumulation in np.arange.
        filter_layout = QHBoxLayout()
        filter_layout.addWidget(QLabel("最低评分:"))
        self.min_rate = QComboBox()
        self.min_rate.addItems([f"{t / 10:.1f}" for t in range(80, 100)])
        filter_layout.addWidget(self.min_rate)
        filter_layout.addWidget(QLabel("最高评分:"))
        self.max_rate = QComboBox()
        self.max_rate.addItems([f"{t / 10:.1f}" for t in range(81, 101)])
        filter_layout.addWidget(self.max_rate)
        filter_layout.addWidget(QLabel("起始年份:"))
        self.min_year = QComboBox()
        self.min_year.addItems([str(y) for y in range(1900, 2025)])
        filter_layout.addWidget(self.min_year)
        filter_layout.addWidget(QLabel("结束年份:"))
        self.max_year = QComboBox()
        self.max_year.addItems([str(y) for y in range(1901, 2031)])
        filter_layout.addWidget(self.max_year)
        self._reset_filter_inputs()
        layout.addLayout(filter_layout)

        # Data table. It lives only in the "数据" tab; it used to be added
        # to the main layout first and then immediately re-parented.
        self.table = QTableWidget()
        self.table.setColumnCount(7)
        self.table.setHorizontalHeaderLabels(["排名", "名称", "评分", "年份", "地区", "主创", "简介"])
        self.table.horizontalHeader().setSectionResizeMode(QHeaderView.ResizeMode.Stretch)

        # Log view. It lives only in the "日志" tab, so the old 80 px height
        # cap from the main-layout placement is not applied.
        self.log_text = QTextEdit()
        self.log_text.setReadOnly(True)
        log_tab = QWidget()
        log_tab.setLayout(QVBoxLayout())
        log_tab.layout().addWidget(self.log_text)

        # Chart tab: a scroll area whose content widget stacks the canvases.
        scroll = QScrollArea()
        scroll.setWidgetResizable(True)
        content = QWidget()
        self.chart_layout = QVBoxLayout(content)
        scroll.setWidget(content)

        self.tabs = QTabWidget()
        self.tabs.addTab(self.table, "数据")
        self.tabs.addTab(log_tab, "日志")
        self.tabs.addTab(scroll, "📊 可视化")
        layout.addWidget(self.tabs)

        # Signal connections.
        self.start_btn.clicked.connect(self.start_crawl)
        self.stop_btn.clicked.connect(self.stop_crawl)
        self.export_btn.clicked.connect(self.export_excel)
        self.filter_btn.clicked.connect(self.apply_filter)
        self.reset_btn.clicked.connect(self.reset_filter)

    def _reset_filter_inputs(self):
        """Put the four filter combo boxes back on their default selections."""
        self.min_rate.setCurrentText("8.0")
        self.max_rate.setCurrentText("10.0")
        self.min_year.setCurrentText("1900")
        self.max_year.setCurrentText("2030")

    def start_crawl(self):
        """Start a new crawl thread unless one is already running."""
        if self.crawl_thread and self.crawl_thread.isRunning():
            return
        self.progress_bar.setValue(0)  # do not show the previous run's progress
        self.crawl_thread = CrawlThread()
        self.crawl_thread.progress_signal.connect(self.progress_bar.setValue)
        self.crawl_thread.log_signal.connect(self.log_text.append)
        self.crawl_thread.data_signal.connect(self.on_data_received)
        self.crawl_thread.finish_signal.connect(self.on_crawl_finished)
        self.crawl_thread.start()
        self.start_btn.setEnabled(False)
        self.stop_btn.setEnabled(True)

    def on_data_received(self, movies):
        """Store the crawl result and show it unfiltered in the table."""
        self.movies_data = movies
        self.filtered_data = movies.copy()
        self.update_table()

    def update_table(self):
        """Rebuild the table rows from ``filtered_data``.

        The rank column is the 1-based position in the current list; the
        director/cast and summary cells are truncated to 20 and 30
        characters.
        """
        self.table.setRowCount(len(self.filtered_data))
        for row, m in enumerate(self.filtered_data):
            cells = (
                str(row + 1),
                m["name"],
                f"{m['rating']:.1f}",
                str(m["year"]) if m["year"] > 0 else "未知",
                m["country"],
                m["director_actor"][:20],
                m["summary"][:30],
            )
            for column, value in enumerate(cells):
                self.table.setItem(row, column, QTableWidgetItem(value))

    def on_crawl_finished(self):
        """Re-enable the controls and refresh the filter view and charts."""
        self.start_btn.setEnabled(True)
        self.stop_btn.setEnabled(False)
        self.export_btn.setEnabled(True)
        self.filter_btn.setEnabled(True)
        self.reset_btn.setEnabled(True)
        self.log_text.append("🎉 爬取完成!共获取 250 部电影")
        self.apply_filter()

    def stop_crawl(self):
        """Ask a running crawl thread to stop before its next page."""
        if self.crawl_thread and self.crawl_thread.isRunning():
            self.crawl_thread.is_running = False
            self.log_text.append("🛑 正在停止...")

    def apply_filter(self):
        """Filter ``movies_data`` by the selected rating and year ranges.

        Movies with an unknown year (0) pass the year check. The result is
        re-sorted by rating (descending) then year, and the table and charts
        are refreshed. Any error is shown in a warning dialog.
        """
        try:
            min_r = float(self.min_rate.currentText())
            max_r = float(self.max_rate.currentText())
            min_y = int(self.min_year.currentText())
            max_y = int(self.max_year.currentText())
            self.filtered_data = [
                m for m in self.movies_data
                if min_r <= m["rating"] <= max_r
                and (m["year"] == 0 or min_y <= m["year"] <= max_y)
            ]
            self.filtered_data.sort(
                key=lambda x: (-x["rating"], x["year"] if x["year"] > 0 else 9999)
            )
            self.update_table()
            self.show_visualization()
            self.log_text.append(f"🔍 筛选完成:{len(self.filtered_data)} 部符合条件")
        except Exception as e:
            # UI boundary: surface the problem instead of crashing the slot.
            QMessageBox.warning(self, "筛选错误", str(e))

    def reset_filter(self):
        """Show all crawled data again and restore the default filter inputs."""
        # Previously the combo boxes kept the old selection after a reset,
        # so the inputs no longer matched what was displayed.
        self._reset_filter_inputs()
        self.filtered_data = self.movies_data.copy()
        self.update_table()
        self.show_visualization()
        self.log_text.append("🔄 已重置筛选")

    def _clear_charts(self):
        """Remove every item from the chart layout and release its figure."""
        while self.chart_layout.count():
            child = self.chart_layout.takeAt(0)
            widget = child.widget()
            if widget is None:
                continue  # spacing / stretch item, nothing to delete
            figure = getattr(widget, "figure", None)
            if figure is not None:
                # Figures made through pyplot stay registered until closed;
                # deleting the widget alone leaks one figure per refresh.
                plt.close(figure)
            widget.deleteLater()

    def show_visualization(self):
        """Rebuild the chart tab from ``filtered_data``.

        Old charts are removed first; with no data the tab stays empty.
        Each canvas gets a minimum height so the scroll area scrolls rather
        than compressing all charts into the visible viewport.
        """
        self._clear_charts()
        if not self.filtered_data:
            return
        chart_builders = (
            generate_rating_line_chart,
            generate_year_histogram,
            generate_country_pie_chart,
            generate_rating_boxplot,
            generate_wordcloud,
        )
        for position, build_chart in enumerate(chart_builders):
            if position:
                self.chart_layout.addSpacing(20)  # gap between charts
            canvas = build_chart(self.filtered_data)
            canvas.setMinimumHeight(self.CHART_MIN_HEIGHT)
            self.chart_layout.addWidget(canvas)
        self.chart_layout.addStretch()

    def export_excel(self):
        """Write ``filtered_data`` to an .xlsx file chosen by the user."""
        if not self.filtered_data:
            QMessageBox.warning(self, "警告", "暂无数据可导出")
            return
        path, _ = QFileDialog.getSaveFileName(self, "导出Excel", "豆瓣Top250.xlsx", "Excel文件 (*.xlsx)")
        if not path:
            return
        try:
            wb = Workbook()
            ws = wb.active
            ws.title = "豆瓣Top250"
            ws.append(["排名", "名称", "评分", "年份", "地区", "主创", "简介"])
            for rank, m in enumerate(self.filtered_data, 1):
                ws.append([
                    rank, m["name"], m["rating"],
                    m["year"] if m["year"] > 0 else "未知",
                    m["country"], m["director_actor"], m["summary"],
                ])
            # Uniform column width for every populated column.
            for col in ws.columns:
                ws.column_dimensions[col[0].column_letter].width = 15
            wb.save(path)
            QMessageBox.information(self, "成功", f"✅ 已导出至:\n{path}")
        except PermissionError:
            QMessageBox.critical(self, "权限错误", "请关闭同名文件再试")
        except Exception as e:
            QMessageBox.critical(self, "导出失败", str(e))
# ==================== 启动 ====================
# Script entry point: create the Qt application, show the main window and
# hand control to the event loop until the window is closed.
if __name__ == "__main__":
    app = QApplication(sys.argv)
    app.setStyle("Fusion")
    app.setFont(QFont("微软雅黑", 9))
    win = MainWindow()
    win.show()
    sys.exit(app.exec())
# TODO(review): the pasted source had a request (in Chinese) fused onto the
# sys.exit line above, which made the file a syntax error. It asked for the
# visualisation charts to be split into separate square panels so they are
# not squeezed together and can all be viewed, and for the complete code.