"""
《乱世佳人》文学文本分析工具 —— 专业增强版
功能亮点:
- ✨ 深色/浅色主题切换(护眼模式)
- 📤 一键导出 Markdown 分析报告
- 😊 基于 VADER 的文学情感分析(比 TextBlob 更准)
- 🔗 高效人物共现关系构建(使用 itertools 优化)
- ⚡ 多线程 + 缓存,避免重复计算
- 🎨 精致 UI:卡片式摘要、分隔线、加载指示器
作者:AI 助手 | 日期:2025-12-09
"""
import tkinter as tk
from tkinter import ttk, scrolledtext, filedialog, messagebox
import re
from collections import Counter
import threading
import hashlib
import os
from itertools import combinations
# === 第三方库 ===
try:
from ttkthemes import ThemedTk
except ImportError:
raise ImportError("请安装 ttkthemes: pip install ttkthemes")
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# 使用 VADER 替代 TextBlob,更适合文学/口语化文本
try:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
VADER_AVAILABLE = True
except ImportError:
VADER_AVAILABLE = False
print("警告:未安装 vaderSentiment,将回退到基础情感提示。")
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import networkx as nx
# === Global configuration ===
# Preset character names (lowercase) that the analyzers search for in the text.
CHARACTERS = [
    "scarlett", "rhett", "melanie", "ashley", "ellen", "gerald",
    "pitty", "archie", "frank", "bonnie", "suellen", "mammy"
]
# Analyzer registry: "name" keys the cache entries and tab dict,
# "title" is the notebook tab label shown to the user.
ANALYZERS = [
    {"name": "basic", "title": "📊 基本统计"},
    {"name": "freq", "title": "📈 词频分析"},
    {"name": "sentiment", "title": "😊 情感分析"},
    {"name": "chars", "title": "👥 人物识别"},
    {"name": "relations", "title": "🔗 人物关系"},
]
# Shared fonts/paddings referenced by several widgets.
UI_CONFIG = {
    "font_normal": ("Segoe UI", 10),
    "font_bold": ("Segoe UI", 10, "bold"),
    "card_padding": (12, 8),
    "button_font": ("Segoe UI", 9),
}
# === 工具函数 ===
def clean_text(text):
    """
    Clean raw text: lowercase, strip non-letter characters, tokenize,
    drop stopwords and very short tokens.

    Args:
        text (str): raw input text
    Returns:
        list[str]: cleaned word tokens (len > 2, not English stopwords)
    """
    text = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Performance fix: stopwords.words('english') builds a fresh list on every
    # call, and `in <list>` is O(n) — evaluating it once per token made the
    # comprehension effectively quadratic. Build the set once, up front.
    stop = set(stopwords.words('english'))
    return [w for w in word_tokenize(text) if w not in stop and len(w) > 2]
def split_sentences(text):
    """
    Split English text into sentences with NLTK, lowercased.

    Args:
        text (str): raw text
    Returns:
        list[str]: lowercased sentences
    """
    lowered = text.lower()
    return sent_tokenize(lowered)
def validate_input(text):
    """
    Decide whether *text* looks like valid English literary text.

    Criteria:
    - non-empty after stripping, with length >= 10
    - at least 10 English (ASCII) letters
    - English letters make up >= 30% of all alphabetic characters
    Returns:
        bool: True when the text passes every check
    """
    stripped = text.strip()
    if not stripped or len(stripped) < 10:
        return False
    english = sum('a' <= c.lower() <= 'z' for c in stripped)
    alphabetic = [c for c in stripped if c.isalpha()]
    # Guard against division by zero when the text has no letters at all.
    denominator = max(len(alphabetic), 1)
    return english >= 10 and english / denominator >= 0.3
def read_file_smart(path):
    """
    Read a text file, trying several encodings in order (UTF-8 variants first).

    Args:
        path: filesystem path to the file
    Returns:
        str: decoded file contents
    Raises:
        ValueError: when none of the candidate encodings can decode the file
    """
    candidate_encodings = ('utf-8-sig', 'utf-8', 'gbk', 'latin1')
    for encoding in candidate_encodings:
        try:
            with open(path, encoding=encoding) as handle:
                return handle.read()
        except UnicodeDecodeError:
            pass
    raise ValueError("无法读取文件编码,请使用纯文本(.txt)文件。")
def clean_markup(text):
    """
    Strip common markup residue so downstream analysis sees plain prose.

    Handles:
    - HTML tags such as <div>...</div>
    - Markdown links [text](url) and images 
    Finally collapses runs of whitespace into single spaces.
    """
    without_html = re.sub(r'<[^>]+>', ' ', text)
    without_links = re.sub(r'\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)', ' ', without_html)
    collapsed = re.sub(r'\s+', ' ', without_links)
    return collapsed.strip()
# === 缓存系统 ===
class Cache:
    """Tiny in-memory cache keyed by (text-hash, analyzer-name) tuples."""

    def __init__(self):
        # Maps (hash_key, analyzer_name) -> analysis result.
        self._cache = {}

    def key(self, text):
        """Return a short (16 hex chars) MD5 digest identifying *text*."""
        digest = hashlib.md5(text.encode('utf-8', errors='ignore'))
        return digest.hexdigest()[:16]

    def get(self, k, name):
        """Return the cached result for (k, name), or None when absent."""
        return self._cache.get((k, name))

    def set(self, k, name, data):
        """Store *data* under (k, name)."""
        self._cache[(k, name)] = data
# Module-level singleton shared by the analysis pipeline and the UI.
CACHE = Cache()
# === 分析引擎(独立模块)===
def basic_stats(text):
    """Return basic statistics: total token count and unique token count."""
    tokens = clean_text(text)
    unique_tokens = set(tokens)
    return {"total": len(tokens), "unique": len(unique_tokens)}
def word_freq(text):
    """Return the 30 most frequent words plus the full Counter (for word clouds)."""
    counter = Counter(clean_text(text))
    return {"top": counter.most_common(30), "counter": counter}
def sentiment(text):
    """Run VADER sentiment analysis; return an advisory dict when unavailable."""
    if not VADER_AVAILABLE:
        # Graceful fallback keeps the same dict shape with no scores.
        return {
            "mood": "⚠️ 模型缺失",
            "desc": "请安装 vaderSentiment 以启用精准情感分析:pip install vaderSentiment",
            "compound": None,
            "detail": {}
        }
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound = scores['compound']
    # VADER's conventional cutoffs: >= 0.05 positive, <= -0.05 negative.
    if compound >= 0.05:
        mood = "积极乐观 😊"
    elif compound <= -0.05:
        mood = "消极悲观 😞"
    else:
        mood = "中性平和 😐"
    return {
        "mood": mood,
        "desc": "基于 VADER 模型(专为文学、口语化文本设计)",
        "compound": round(compound, 3),
        "detail": scores
    }
def characters(text):
    """
    Count occurrences of each preset character name in *text*.

    Fix: the previous implementation used substring counting (str.count),
    so a name embedded in another word also counted — e.g. "frank" matched
    inside "frankly". Whole-word regex matching keeps the counts accurate.

    Returns:
        dict[str, int]: name -> count, only for names occurring at least once
    """
    low = text.lower()
    counts = {}
    for name in CHARACTERS:
        # Names are plain ASCII letters, so no re.escape is needed.
        n = len(re.findall(rf'\b{name}\b', low))
        if n:
            counts[name] = n
    return counts
def relations(text):
    """
    Build the character co-occurrence network.

    - Sentences mentioning fewer than two characters contribute nothing
      (combinations of a short list is empty, so no explicit skip needed).
    - itertools.combinations over the sorted mention list yields each
      undirected pair exactly once per sentence.
    - Fix: mentions are now detected with whole-word regex matching instead
      of substring `in`, so a name embedded in another word (e.g. "frank"
      inside "frankly") no longer counts as a mention.

    Returns:
        dict: {"pairs": Counter mapping (a, b) name tuples -> co-occurrence count}
    """
    pairs = Counter()
    # Compile one whole-word pattern per character, once, outside the loop.
    patterns = {c: re.compile(rf'\b{c}\b') for c in CHARACTERS}
    for sentence in split_sentences(text):
        mentioned = sorted(c for c, pat in patterns.items() if pat.search(sentence))
        for a, b in combinations(mentioned, 2):
            pairs[(a, b)] += 1
    return {"pairs": pairs}
# === 摘要生成器 ===
def generate_summary(name, data):
    """
    Turn one analyzer's raw result dict into a user-friendly summary string.

    Args:
        name (str): analyzer key ("basic", "freq", "sentiment", "chars",
            "relations"); any other key falls back to str(data)
        data: that analyzer's result dict
    Returns:
        str: Markdown-flavoured summary (uses **bold** markers)
    """

    def _basic():
        total, unique = data["total"], data["unique"]
        diversity = unique / total if total else 0
        if diversity > 0.3:
            level = '较高'
        elif diversity > 0.2:
            level = '中等'
        else:
            level = '较低'
        return (
            f"📌 这段文本共包含 **{total:,} 个单词**,其中 **{unique:,} 个不重复单词**。\n"
            f"词汇丰富度为 **{diversity:.1%}** —— **{level}**。\n"
            f"📚 小知识:经典文学作品通常在 20%~40% 之间。"
        )

    def _freq():
        top_words = [w for w, _ in data["top"][:10]]
        return (
            f"🔤 **高频词前10名**:{' · '.join(top_words)}\n"
            f"这些词反映核心主题(已过滤停用词)。\n"
            f"若出现人名(如 scarlett),说明聚焦人物互动。"
        )

    def _sentiment():
        if data["compound"] is None:
            return data["desc"]
        return (
            f"🎭 **情绪基调**:{data['mood']}\n"
            f"{data['desc']}\n"
            f"📊 复合极性值:{data['compound']}(-1~1,越正越积极)\n"
            f"🔍 细项:正面={data['detail']['pos']:.2f}, "
            f"负面={data['detail']['neg']:.2f}, "
            f"中性={data['detail']['neu']:.2f}"
        )

    def _chars():
        if not data:
            return "🔍 **未检测到预设人物**。\n请确保包含如 Scarlett、Rhett 等名字(不区分大小写)。"
        main = max(data, key=data.get)
        mentions = sum(data.values())
        return (
            f"👑 **核心人物**:**{main.capitalize()}**(被提及 {data[main]} 次)\n"
            f"共识别 {len(data)} 位角色,总计 {mentions} 次提及。\n"
            f"当前段落可能围绕 {main.capitalize()} 展开情节。"
        )

    def _relations():
        strong = [(a, b, w) for (a, b), w in data["pairs"].items() if w >= 2]
        if not strong:
            return "🕸️ **未发现强人物关联**。\n建议分析更长段落(如一整章)以捕捉互动。"
        a, b, w = max(strong, key=lambda edge: edge[2])
        return (
            f"💞 **最紧密关系**:**{a.capitalize()} ↔ {b.capitalize()}**(共现 {w} 次)\n"
            f"两人可能频繁互动(情侣/对手/亲属)。\n"
            f"点击下方按钮查看完整关系网络图。"
        )

    handlers = {
        "basic": _basic,
        "freq": _freq,
        "sentiment": _sentiment,
        "chars": _chars,
        "relations": _relations,
    }
    handler = handlers.get(name)
    return handler() if handler else str(data)
# === 结果渲染器(策略模式)===
class ResultRenderer:
    """Decouples the per-analyzer UI rendering logic (strategy pattern)."""

    @staticmethod
    def _create_summary_card(parent, summary, is_dark):
        """Create a uniformly styled summary card; renders **bold** markers.

        Args:
            parent: container widget
            summary (str): summary text, may contain **bold** spans
            is_dark (bool): current theme flag (picks card colors)
        Returns:
            tk.Frame: the packed card frame
        """
        bg = "#2d2d2d" if is_dark else "#f8f9fa"
        fg = "white" if is_dark else "black"
        card = tk.Frame(parent, bg=bg, relief="groove", bd=1)
        card.pack(fill=tk.X, padx=10, pady=(10, 5))
        txt = tk.Text(
            card, wrap=tk.WORD, bg=bg, fg=fg, relief="flat",
            height=4, font=UI_CONFIG["font_normal"]
        )
        txt.insert(tk.END, summary)
        # Render **bold** spans: delete each pair of markers and tag the text
        # between them.
        # Fix: the old loop fed a full "line.char" index back into the
        # template f"1.{start}", producing invalid indices like "1.1.18" on
        # the second iteration (TclError whenever a summary had two or more
        # bold spans). We now keep a proper position cursor instead.
        pos = "1.0"
        while True:
            open_idx = txt.search("**", pos, tk.END)
            if not open_idx:
                break
            close_idx = txt.search("**", f"{open_idx}+2c", tk.END)
            if not close_idx:
                break
            txt.delete(open_idx, f"{open_idx}+2c")
            # After the opening marker is removed, the closing marker on the
            # same line has shifted left by two characters, to close_idx-2c.
            # NOTE(review): spans whose closing "**" sits on a different line
            # are not handled — confirm summaries keep bold within one line.
            txt.delete(f"{close_idx}-2c", close_idx)
            txt.tag_add("bold", open_idx, f"{close_idx}-2c")
            pos = txt.index(f"{close_idx}-2c")
        # Fix: `tk.font` is the tkinter.font submodule, which `import tkinter
        # as tk` does NOT import, so `tk.font.Font(...)` raised AttributeError
        # unless another module happened to import tkinter.font first. A font
        # tuple (already defined in UI_CONFIG) is accepted directly.
        txt.tag_configure("bold", font=UI_CONFIG["font_bold"])
        txt.config(state='disabled')
        txt.pack(padx=12, pady=8, fill=tk.X)
        return card

    @staticmethod
    def render_basic(tab, data, app):
        """Render the basic-statistics tab (summary card only)."""
        summary = generate_summary("basic", data)
        ResultRenderer._create_summary_card(tab, summary, app.is_dark)

    @staticmethod
    def render_freq(tab, data, app):
        """Render the word-frequency tab: summary card, top list, word-cloud button."""
        summary = generate_summary("freq", data)
        ResultRenderer._create_summary_card(tab, summary, app.is_dark)
        list_frame = ttk.Frame(tab)
        list_frame.pack(fill=tk.BOTH, expand=True, padx=10)
        txt = scrolledtext.ScrolledText(list_frame, width=30, font=("Consolas", 9))
        for w, c in data["top"]:
            txt.insert(tk.END, f"{w:<15} : {c}\n")
        txt.config(state='disabled')
        txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        btn = ttk.Button(list_frame, text="🖼️ 生成词云", command=lambda: app.show_wordcloud(data["counter"]))
        btn.pack(side=tk.RIGHT, padx=10, pady=10)

    @staticmethod
    def render_sentiment(tab, data, app):
        """Render the sentiment tab (summary card only)."""
        summary = generate_summary("sentiment", data)
        ResultRenderer._create_summary_card(tab, summary, app.is_dark)

    @staticmethod
    def render_chars(tab, data, app):
        """Render the character-detection tab: summary card plus a count list."""
        summary = generate_summary("chars", data)
        ResultRenderer._create_summary_card(tab, summary, app.is_dark)
        if data:
            txt = scrolledtext.ScrolledText(tab, height=6, font=("Consolas", 9))
            for c, n in sorted(data.items(), key=lambda x: -x[1]):
                txt.insert(tk.END, f"{c.capitalize():<12} : {n} 次\n")
            txt.config(state='disabled')
            txt.pack(fill=tk.BOTH, expand=True, padx=10)

    @staticmethod
    def render_relations(tab, data, app):
        """Render the relations tab: summary card plus a graph button when edges exist."""
        summary = generate_summary("relations", data)
        ResultRenderer._create_summary_card(tab, summary, app.is_dark)
        pairs = [(a, b, w) for (a, b), w in data["pairs"].items() if w >= 2]
        if pairs:
            btn = ttk.Button(tab, text="🌐 查看人物关系图", command=lambda: app.show_relations(data["pairs"]))
            btn.pack(pady=10)
# === 主应用类 ===
class App:
    """Main application window: input handling, analysis orchestration, export."""

    def __init__(self, root):
        self.root = root
        self.is_dark = False
        # Indeterminate progress bar shown while background analysis runs.
        self.spinner = None
        root.title("《乱世佳人》文本分析工具 — 专业增强版")
        root.geometry("920x800")
        self.setup_ui()

    def setup_ui(self):
        """Build the menu bar, control row, input area and result notebook."""
        # Menu bar
        menubar = tk.Menu(self.root)
        view_menu = tk.Menu(menubar, tearoff=0)
        view_menu.add_command(label="切换深色/浅色主题", command=self.toggle_theme)
        menubar.add_cascade(label="视图", menu=view_menu)
        self.root.config(menu=menubar)
        style = ttk.Style()
        style.configure("TButton", font=UI_CONFIG["button_font"])
        style.configure("TNotebook.Tab", padding=[12, 6])
        # Control row
        ctrl = ttk.Frame(self.root)
        ctrl.pack(pady=8, padx=10, fill=tk.X)
        ttk.Button(ctrl, text="📂 加载文件", command=self.load_file).pack(side=tk.LEFT)
        ttk.Button(ctrl, text="🧹 清空", command=self.clear).pack(side=tk.LEFT, padx=5)
        self.analyze_btn = ttk.Button(ctrl, text="▶️ 分析全部", state='disabled', command=self.start_analysis)
        self.analyze_btn.pack(side=tk.LEFT, padx=5)
        self.export_btn = ttk.Button(ctrl, text="📤 导出报告", state='disabled', command=self.export_report)
        self.export_btn.pack(side=tk.LEFT, padx=5)
        self.status = ttk.Label(ctrl, text="📝 请粘贴英文小说原文或加载 .txt 文件", foreground="gray")
        self.status.pack(side=tk.RIGHT)
        # Input area
        ttk.Label(self.root, text="📖 英文小说原文(建议 ≥100 字):", font=UI_CONFIG["font_bold"]).pack(anchor='w', padx=10, pady=(10, 0))
        self.text_area = scrolledtext.ScrolledText(self.root, height=5, font=("Consolas", 10))
        self.text_area.pack(padx=10, fill=tk.BOTH)
        self.text_area.bind('<KeyRelease>', self.on_text_change)
        # Separator
        ttk.Separator(self.root, orient='horizontal').pack(fill='x', padx=10, pady=5)
        # Results notebook: one tab per analyzer in ANALYZERS.
        self.notebook = ttk.Notebook(self.root)
        self.notebook.pack(padx=10, pady=(0, 10), fill=tk.BOTH, expand=True)
        self.tabs = {}
        for a in ANALYZERS:
            frame = ttk.Frame(self.notebook)
            self.notebook.add(frame, text=a["title"])
            self.tabs[a["name"]] = frame

    def toggle_theme(self):
        """Toggle dark/light theme and re-render cached results under it."""
        self.is_dark = not self.is_dark
        # NOTE(review): "azure"/"azure-dark" are not bundled with ttkthemes;
        # set_theme will fail unless the azure .tcl theme files are installed
        # separately — confirm availability in the target environment.
        theme = "azure-dark" if self.is_dark else "azure"
        self.root.set_theme(theme)
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        if hasattr(self, 'current_hash'):
            self._refresh_ui_with_theme(text)

    def _refresh_ui_with_theme(self, text):
        """Re-render every tab from cache (theme switch only — no re-analysis)."""
        for spec in ANALYZERS:
            data = CACHE.get(self.current_hash, spec["name"])
            if data is not None:
                for w in self.tabs[spec["name"]].winfo_children():
                    w.destroy()
                getattr(ResultRenderer, f"render_{spec['name']}")(self.tabs[spec["name"]], data, self)

    def on_text_change(self, event=None):
        """Validate the current input and refresh button states / status line."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        valid = validate_input(text)
        state = 'normal' if valid else 'disabled'
        self.analyze_btn.config(state=state)
        self.export_btn.config(state=state)
        self.status.config(
            text="✅ 内容有效 · 可点击「分析全部」" if valid else
            "📝 请粘贴英文小说原文或加载文件" if not raw.strip() else
            "⚠️ 当前内容暂不符合分析要求",
            foreground="green" if valid else "gray" if not raw.strip() else "orange"
        )

    def load_file(self):
        """Pick a .txt file and read it on a background thread."""
        path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
        if not path:
            return
        threading.Thread(target=self._load_bg, args=(path,), daemon=True).start()

    def _load_bg(self, path):
        """Background file read; marshals all UI updates to the main thread."""
        try:
            content = read_file_smart(path)
            self.root.after(0, self._replace_text, content)
        except Exception as e:
            # Fix: the except-variable `e` is unbound as soon as the except
            # clause exits, so the old `lambda: ... str(e)` raised NameError
            # when Tk later invoked the callback. Capture the message now.
            msg = str(e)
            self.root.after(0, lambda: messagebox.showerror("错误", msg))

    def _replace_text(self, content):
        """Replace the input area's contents (must run on the main thread)."""
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, content)

    def clear(self):
        """Clear the input area and refresh validation state."""
        self.text_area.delete(1.0, tk.END)
        self.on_text_change()

    def start_analysis(self):
        """Validate input, show the spinner, and launch background analysis."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        if not validate_input(text):
            messagebox.showwarning("输入无效", "请确保输入足够长的英文文本。")
            return
        new_hash = CACHE.key(text)
        # Skip re-analysis when the text is unchanged since the last run.
        if getattr(self, 'current_hash', None) == new_hash:
            self.status.config(text="ℹ️ 内容未变,跳过重复分析", foreground="blue")
            return
        self.current_hash = new_hash
        # Loading indicator
        if self.spinner:
            self.spinner.destroy()
        self.spinner = ttk.Progressbar(self.root, mode='indeterminate')
        self.spinner.pack(pady=2)
        self.spinner.start()
        self.analyze_btn.config(state='disabled')
        threading.Thread(target=self._analyze_bg, args=(text,), daemon=True).start()

    def _analyze_bg(self, text):
        """Run every analyzer (cache-aware) off the UI thread."""
        results = {}
        analysis_funcs = {
            "basic": basic_stats,
            "freq": word_freq,
            "sentiment": sentiment,
            "chars": characters,
            "relations": relations
        }
        for name, func in analysis_funcs.items():
            cached = CACHE.get(self.current_hash, name)
            if cached is None:
                cached = func(text)
                CACHE.set(self.current_hash, name, cached)
            results[name] = cached
        self.root.after(0, lambda: self._update_ui(results))

    def _update_ui(self, results):
        """Render all results on the main thread and remove the spinner."""
        if self.spinner:
            self.spinner.stop()
            self.spinner.destroy()
            self.spinner = None
        for name, data in results.items():
            tab = self.tabs[name]
            for w in tab.winfo_children():
                w.destroy()
            getattr(ResultRenderer, f"render_{name}")(tab, data, self)
        self.status.config(text="🎉 分析完成!切换标签页查看结果", foreground="green")
        self.analyze_btn.config(state='normal')
        self.export_btn.config(state='normal')

    def export_report(self):
        """Export the cached analysis summaries as a Markdown report."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        if not validate_input(text) or not hasattr(self, 'current_hash'):
            messagebox.showwarning("导出失败", "请先完成有效分析。")
            return
        path = filedialog.asksaveasfilename(
            defaultextension=".md",
            filetypes=[("Markdown 文件", "*.md"), ("所有文件", "*.*")]
        )
        if not path:
            return
        try:
            md = f"# 《乱世佳人》文本分析报告\n\n> 基于 {len(text)} 字符的英文文本\n\n---\n"
            for a in ANALYZERS:
                data = CACHE.get(self.current_hash, a["name"])
                if data is not None:
                    summary = generate_summary(a["name"], data)
                    # Drop the **bold** markers used by the in-app cards.
                    summary_clean = summary.replace("**", "")
                    md += f"\n## {a['title']}\n\n{summary_clean}\n"
            with open(path, 'w', encoding='utf-8') as f:
                f.write(md)
            messagebox.showinfo("导出成功", f"报告已保存至:\n{os.path.abspath(path)}")
        except Exception as e:
            messagebox.showerror("导出失败", f"保存时出错:\n{str(e)}")

    # === Visualisations ===
    def show_wordcloud(self, counter):
        """Render a word cloud from a {word: count} mapping via matplotlib."""
        try:
            wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(counter)
            plt.figure(figsize=(10, 5))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis("off")
            plt.title("高频词云图", fontsize=14)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            messagebox.showerror("错误", f"词云生成失败:{e}")

    def show_relations(self, pairs):
        """Draw the character co-occurrence network (edges with weight >= 2)."""
        G = nx.Graph()
        for (a, b), w in pairs.items():
            if w >= 2:
                G.add_edge(a.capitalize(), b.capitalize(), weight=w)
        if not G.nodes():
            messagebox.showinfo("提示", "无足够共现数据(需 ≥2 次)")
            return
        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(G, k=3, iterations=100)
        nx.draw_networkx_nodes(G, pos, node_size=1200, node_color='lightcoral')
        nx.draw_networkx_labels(G, pos, font_size=11, font_weight='bold')
        weights = [d['weight'] for u, v, d in G.edges(data=True)]
        nx.draw_networkx_edges(G, pos, width=[w*1.2 for w in weights], alpha=0.6)
        plt.title("人物共现关系网络(阈值:≥2 次)", fontsize=14)
        plt.axis('off')
        plt.tight_layout()
        plt.show()
# === Entry point ===
if __name__ == "__main__":
    # NOTE(review): ttkthemes does not bundle an "azure" theme; ThemedTk may
    # raise here unless the azure .tcl theme files are installed separately —
    # confirm, or consider a bundled theme such as "arc".
    root = ThemedTk(theme="azure")
    App(root)
    root.mainloop()
# NOTE(review): stray non-code text "这个打分" commented out — as bare text it
# was a SyntaxError that prevented the module from being imported at all.