乱世佳人文本分析与可视化
作业描述
读取附件gone_with_the_wind.txt,基于内置模块tkinter开发一个文本分析与可视化程序,提供友好的图形用户界面。
功能需求:
1.文本输入:支持从文件读取或直接输入文本
2.基本统计:统计文本中的单词总数、不重复单词数
3.词频分析:统计各个单词出现的频率
4.生成词云图
5.显示词频统计结果
6.情感分析
7.人物识别
8.人物关系分析和可视化
import tkinter as tk
from tkinter import ttk, scrolledtext, filedialog, messagebox
import re
from collections import Counter
import threading
import hashlib
import os
# === Third-party libraries ===
# ttkthemes provides the themed root window; fail fast with install advice.
try:
    from ttkthemes import ThemedTk
except ImportError:
    raise ImportError("请安装 ttkthemes: pip install ttkthemes")
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
# Fetch tokenizer/stopword data on first run (no-op when already downloaded).
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import networkx as nx
# === Configuration ===
# Lowercase first names recognized by the character / relation analyzers.
CHARACTERS = ["scarlett", "rhett", "melanie", "ashley", "ellen", "gerald",
              "pitty", "archie", "frank", "bonnie", "suellen", "mammy"]
# One entry per analyzer: `name` keys results and cache entries,
# `title` labels the corresponding notebook tab.
ANALYZERS = [
    {"name": "basic", "title": "📊 基本统计"},
    {"name": "freq", "title": "📈 词频分析"},
    {"name": "sentiment", "title": "😊 情感分析"},
    {"name": "chars", "title": "👥 人物识别"},
    {"name": "relations", "title": "🔗 人物关系"},
]
# === 工具函数 ===
def clean_text(text):
    """Lowercase *text*, strip non-letters, and return content tokens.

    Tokens are NLTK word tokens longer than 2 characters that are not
    English stopwords.
    """
    text = re.sub(r'[^a-z\s]', ' ', text.lower())
    # Build the stopword set once: the original called stopwords.words()
    # inside the comprehension, re-reading the corpus and doing an O(m)
    # list scan for every single token.
    stop = set(stopwords.words('english'))
    return [w for w in word_tokenize(text) if w not in stop and len(w) > 2]
def split_sentences(text):
    """Return the sentences of *text*, lowercased, via NLTK's tokenizer."""
    lowered = text.lower()
    return sent_tokenize(lowered)
def validate_input(text):
    """Check that *text* is plausible English prose worth analyzing.

    Accepts only stripped input of at least 10 characters containing at
    least 10 ASCII letters, where ASCII letters account for >= 30% of
    all alphabetic characters (so mostly-CJK text is rejected).
    """
    stripped = text.strip()
    if len(stripped) < 10:
        return False
    ascii_letters = 0
    alpha_total = 0
    for ch in stripped:
        if ch.isalpha():
            alpha_total += 1
        if 'a' <= ch.lower() <= 'z':
            ascii_letters += 1
    if ascii_letters < 10:
        return False
    return ascii_letters / max(alpha_total, 1) >= 0.3
def read_file_smart(path):
    """Read a text file, trying several encodings in order.

    Tries UTF-8 (with and without BOM), GBK, then Latin-1.  A decoding
    failure moves on to the next candidate; genuine I/O errors (missing
    file, permission denied) propagate to the caller instead of being
    masked as an encoding problem — the original bare ``except`` swallowed
    everything, including KeyboardInterrupt.

    Raises:
        ValueError: if no candidate encoding can decode the file.
        OSError: if the file cannot be opened at all.
    """
    for enc in ('utf-8-sig', 'utf-8', 'gbk', 'latin1'):
        try:
            with open(path, encoding=enc) as f:
                return f.read()
        except (UnicodeDecodeError, UnicodeError):
            continue  # wrong encoding guess — try the next candidate
    raise ValueError("无法读取文件编码,请使用纯文本(.txt)文件。")
def clean_markup(text):
    """Strip HTML tags and Markdown links/images, then collapse whitespace."""
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    without_links = re.sub(r'\[.*?\]\(.*?\)|!\[.*?\]\(.*?\)', ' ', without_tags)
    collapsed = re.sub(r'\s+', ' ', without_links)
    return collapsed.strip()
# === 缓存 ===
class Cache:
    """In-memory cache keyed by (text fingerprint, analyzer name)."""

    def __init__(self):
        # Maps (text_key, analyzer_name) -> analysis result.
        self._cache = {}

    def key(self, text):
        """Return a short, stable fingerprint for *text* (16 hex chars of MD5)."""
        digest = hashlib.md5(text.encode('utf-8', errors='ignore')).hexdigest()
        return digest[:16]

    def get(self, k, name):
        """Return the cached result for (k, name), or None if absent."""
        return self._cache.get((k, name))

    def set(self, k, name, data):
        """Store *data* under (k, name)."""
        self._cache[(k, name)] = data

CACHE = Cache()
# === 分析逻辑 ===
def basic_stats(text):
    """Return total and unique content-word counts for *text*."""
    tokens = clean_text(text)
    distinct = set(tokens)
    return {"total": len(tokens), "unique": len(distinct)}
def word_freq(text):
    """Count content-word frequencies; return the top 30 plus the full Counter."""
    counts = Counter(clean_text(text))
    return {"top": counts.most_common(30), "counter": counts}
def sentiment(text):
    """Classify the overall mood of *text* via TextBlob polarity/subjectivity."""
    scores = TextBlob(text).sentiment
    polarity = scores.polarity
    subjectivity = scores.subjectivity
    if polarity > 0.2:
        mood, desc = "积极乐观 😊", "传递希望、喜悦或赞赏。"
    elif polarity < -0.2:
        mood, desc = "消极悲观 😞", "带有悲伤、愤怒或批评倾向。"
    else:
        mood, desc = "中性平和 😐", "情绪克制,偏向叙述。"
    return {"mood": mood, "desc": desc,
            "p": round(polarity, 3), "s": round(subjectivity, 3)}
def characters(text):
    """Count whole-word mentions of each known character name in *text*.

    Returns a dict of {name: count} for names that appear at least once.
    Uses word-boundary matching: the original used substring counting, so
    every "suellen" also counted as an "ellen" (both are in CHARACTERS)
    and e.g. "frankly" counted as "frank".
    """
    low = text.lower()
    counts = {}
    for name in CHARACTERS:
        n = len(re.findall(r'\b' + re.escape(name) + r'\b', low))
        if n:
            counts[name] = n
    return counts
def relations(text):
    """Count sentence-level co-occurrences of character-name pairs.

    A pair is counted once per sentence in which both names appear as
    whole words.  Word-boundary matching prevents false hits such as
    "ellen" matching inside "suellen" (the original used plain ``in``).
    Returns {"pairs": Counter({(name_a, name_b): count})} with each pair
    sorted alphabetically.
    """
    pairs = Counter()
    for sent in split_sentences(text):
        mentioned = [c for c in CHARACTERS
                     if re.search(r'\b' + re.escape(c) + r'\b', sent)]
        for i, a in enumerate(mentioned):
            for b in mentioned[i + 1:]:
                key = (a, b) if a < b else (b, a)
                pairs[key] += 1
    return {"pairs": pairs}
# === 摘要生成器 ===
def generate_summary(name, data):
    """Render a short human-readable summary for one analyzer's result.

    *name* selects the analyzer ("basic", "freq", "sentiment", "chars",
    "relations"); *data* is that analyzer's result dict.  Bold spans are
    marked with **...** for the renderer.  Unknown names fall back to
    ``str(data)``.
    """
    if name == "basic":
        total = data["total"]
        unique = data["unique"]
        div = unique / total if total else 0
        if div > 0.3:
            level = '较高'
        elif div > 0.2:
            level = '中等'
        else:
            level = '较低'
        parts = [
            f"📌 这段文本共包含 **{total:,} 个单词**,其中 **{unique:,} 个不重复单词**。",
            f"词汇丰富度为 **{div:.1%}** —— **{level}**。",
            "📚 小知识:经典文学作品通常在 20%~40% 之间。",
        ]
        return "\n".join(parts)
    if name == "freq":
        top_words = ' · '.join(w for w, _ in data["top"][:10])
        parts = [
            f"🔤 **高频词前10名**:{top_words}",
            "这些词反映核心主题(已过滤停用词)。",
            "若出现人名(如 scarlett),说明聚焦人物互动。",
        ]
        return "\n".join(parts)
    if name == "sentiment":
        parts = [
            f"🎭 **情绪基调**:{data['mood']}",
            f"{data['desc']}",
            f"📊 极性值:{data['p']}(-1~1,越正越积极)",
            f"💭 主观性:{data['s']}(0~1,越高越主观)",
        ]
        return "\n".join(parts)
    if name == "chars":
        if not data:
            return ("🔍 **未检测到预设人物**。\n"
                    "请确保包含如 Scarlett、Rhett 等名字(不区分大小写)。")
        main = max(data, key=data.get)
        total_mentions = sum(data.values())
        parts = [
            f"👑 **核心人物**:**{main.capitalize()}**(被提及 {data[main]} 次)",
            f"共识别 {len(data)} 位角色,总计 {total_mentions} 次提及。",
            f"当前段落可能围绕 {main.capitalize()} 展开情节。",
        ]
        return "\n".join(parts)
    if name == "relations":
        strong = [(a, b, w) for (a, b), w in data["pairs"].items() if w >= 2]
        if not strong:
            return ("🕸️ **未发现强人物关联**。\n"
                    "建议分析更长段落(如一整章)以捕捉互动。")
        a, b, w = max(strong, key=lambda item: item[2])
        parts = [
            f"💞 **最紧密关系**:**{a.capitalize()} ↔ {b.capitalize()}**(共现 {w} 次)",
            "两人可能频繁互动(情侣/对手/亲属)。",
            "点击下方按钮查看完整关系网络图。",
        ]
        return "\n".join(parts)
    return str(data)
# === 渲染函数 ===
def render_result(tab, name, data, app):
    """Clear *tab* and render the summary card plus analyzer-specific widgets.

    tab  -- the ttk.Frame of this analyzer's notebook page
    name -- analyzer key ("basic", "freq", "sentiment", "chars", "relations")
    data -- that analyzer's result dict
    app  -- the App instance (supplies word-cloud / relation-graph callbacks)
    """
    for w in tab.winfo_children():
        w.destroy()
    # Summary card
    card = tk.Frame(tab, bg="#f8f9fa", relief="groove", bd=1)
    card.pack(fill=tk.X, padx=10, pady=(10, 5))
    summary = generate_summary(name, data)
    txt = tk.Text(card, wrap=tk.WORD, bg="#f8f9fa", relief="flat", height=4, font=("Segoe UI", 10))
    txt.insert(tk.END, summary)
    # Turn **...** spans into bold text.
    # BUG FIX 1: the old loop built the next search position as f"1.{start}"
    # where `start` was already a full "line.col" index, producing invalid
    # indices like "1.3.12" (TclError) after the first match; we now keep a
    # proper running index.
    # BUG FIX 2: the old code used tk.font.Font, but tkinter.font was never
    # imported (AttributeError); a (family, size, weight) tuple needs no import.
    idx = "1.0"
    while True:
        start = txt.search("**", idx, tk.END)
        if not start:
            break
        end = txt.search("**", f"{start}+2c", tk.END)
        if not end:
            break
        # Delete the opening marker first; the closing marker (assumed to be
        # on the same line) shifts left by two chars and now sits at
        # [end-2c, end).
        txt.delete(start, f"{start}+2c")
        txt.delete(f"{end}-2c", end)
        txt.tag_add("bold", start, f"{end}-2c")
        idx = txt.index(f"{end}-2c")
    txt.tag_configure("bold", font=("Segoe UI", 10, "bold"))
    txt.config(state='disabled')
    txt.pack(padx=12, pady=8, fill=tk.X)
    # Analyzer-specific content below the card
    if name == "freq":
        list_frame = ttk.Frame(tab)
        list_frame.pack(fill=tk.BOTH, expand=True, padx=10)
        freq_txt = scrolledtext.ScrolledText(list_frame, width=30, font=("Consolas", 9))
        for word, count in data["top"]:
            freq_txt.insert(tk.END, f"{word:<15} : {count}\n")
        freq_txt.config(state='disabled')
        freq_txt.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        btn = ttk.Button(list_frame, text="🖼️ 生成词云",
                         command=lambda: app.show_wordcloud(data["counter"]))
        btn.pack(side=tk.RIGHT, padx=10, pady=10)
    elif name == "chars" and data:
        chars_txt = scrolledtext.ScrolledText(tab, height=6, font=("Consolas", 9))
        for c, n in sorted(data.items(), key=lambda x: -x[1]):
            chars_txt.insert(tk.END, f"{c.capitalize():<12} : {n} 次\n")
        chars_txt.config(state='disabled')
        chars_txt.pack(fill=tk.BOTH, expand=True, padx=10)
    elif name == "relations":
        strong = [(a, b, w) for (a, b), w in data["pairs"].items() if w >= 2]
        if strong:
            btn = ttk.Button(tab, text="🌐 查看人物关系图",
                             command=lambda: app.show_relations(data["pairs"]))
            btn.pack(pady=10)
# === 主应用 ===
class App:
    """Tkinter front-end: text input, threaded analysis, tabbed results, export."""

    def __init__(self, root):
        """Build the main window around *root* (a ThemedTk instance)."""
        self.root = root
        self.is_dark = False
        # Fingerprint of the most recently analyzed text; None until the
        # first analysis.  Used to skip redundant re-analysis and as the
        # cache key when exporting.  (The original never initialized this,
        # so export_report crashed with AttributeError if clicked before
        # the first analysis.)
        self.current_hash = None
        root.title("《乱世佳人》文本分析工具 — 最终增强版")
        root.geometry("900x780")
        self.setup_ui()

    def setup_ui(self):
        """Create the menu bar, control row, input area and result notebook."""
        # Menu bar
        menubar = tk.Menu(self.root)
        view_menu = tk.Menu(menubar, tearoff=0)
        view_menu.add_command(label="切换深色/浅色主题", command=self.toggle_theme)
        menubar.add_cascade(label="视图", menu=view_menu)
        self.root.config(menu=menubar)
        style = ttk.Style()
        style.configure("TButton", font=("Segoe UI", 9))
        style.configure("TNotebook.Tab", padding=[12, 6])
        # Control row
        ctrl = ttk.Frame(self.root)
        ctrl.pack(pady=8, padx=10, fill=tk.X)
        ttk.Button(ctrl, text="📂 加载文件", command=self.load_file).pack(side=tk.LEFT)
        ttk.Button(ctrl, text="🧹 清空", command=self.clear).pack(side=tk.LEFT, padx=5)
        self.analyze_btn = ttk.Button(ctrl, text="▶️ 分析全部", state='disabled', command=self.start_analysis)
        self.analyze_btn.pack(side=tk.LEFT, padx=5)
        self.export_btn = ttk.Button(ctrl, text="📤 导出报告", state='disabled', command=self.export_report)
        self.export_btn.pack(side=tk.LEFT, padx=5)
        self.status = ttk.Label(ctrl, text="📝 请粘贴英文小说原文或加载 .txt 文件", foreground="gray")
        self.status.pack(side=tk.RIGHT)
        # Input area
        ttk.Label(self.root, text="📖 英文小说原文(建议 ≥100 字):").pack(anchor='w', padx=10)
        self.text_area = scrolledtext.ScrolledText(self.root, height=5, font=("Consolas", 10))
        self.text_area.pack(padx=10, fill=tk.BOTH)
        self.text_area.bind('<KeyRelease>', self.on_text_change)
        # Result notebook, one tab per analyzer
        self.notebook = ttk.Notebook(self.root)
        self.notebook.pack(padx=10, pady=5, fill=tk.BOTH, expand=True)
        self.tabs = {}
        for a in ANALYZERS:
            frame = ttk.Frame(self.notebook)
            self.notebook.add(frame, text=a["title"])
            self.tabs[a["name"]] = frame

    def toggle_theme(self):
        """Flip between light and dark themes and recolor the summary cards."""
        self.is_dark = not self.is_dark
        # NOTE(review): "azure"/"azure-dark" are not bundled with ttkthemes;
        # confirm the theme files are installed or set_theme will raise.
        theme = "azure-dark" if self.is_dark else "azure"
        self.root.set_theme(theme)
        bg = "#2d2d2d" if self.is_dark else "#f8f9fa"
        fg = "white" if self.is_dark else "black"
        # Summary cards are plain tk widgets, so they don't follow the ttk
        # theme automatically and must be recolored by hand.
        for tab in self.tabs.values():
            for child in tab.winfo_children():
                if isinstance(child, tk.Frame) and child.cget("relief") == "groove":
                    child.config(bg=bg)
                    for widget in child.winfo_children():
                        if isinstance(widget, tk.Text):
                            widget.config(bg=bg, fg=fg)

    def on_text_change(self, event=None):
        """Re-validate the input and enable/disable the action buttons."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        valid = validate_input(text)
        state = 'normal' if valid else 'disabled'
        self.analyze_btn.config(state=state)
        self.export_btn.config(state=state)
        self.status.config(
            text="✅ 内容有效 · 可点击「分析全部」" if valid else
            "📝 请粘贴英文小说原文或加载文件" if not raw.strip() else
            "⚠️ 当前内容暂不符合分析要求",
            foreground="green" if valid else "gray" if not raw.strip() else "orange"
        )

    def load_file(self):
        """Pick a .txt file and read it on a worker thread (UI stays responsive)."""
        path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
        if not path:
            return
        threading.Thread(target=self._load_bg, args=(path,), daemon=True).start()

    def _load_bg(self, path):
        """Worker: read *path*, then hand the content back to the UI thread."""
        try:
            content = read_file_smart(path)
            self.root.after(0, self._set_text, content)
        except Exception as e:
            # Capture the message now: `e` is unbound once the except block
            # exits, so the original lambda closing over `e` raised NameError
            # by the time the callback actually ran.
            msg = str(e)
            self.root.after(0, lambda: messagebox.showerror("错误", msg))

    def _set_text(self, content):
        """Replace the input area with *content* (runs on the UI thread)."""
        self.text_area.delete(1.0, tk.END)
        self.text_area.insert(tk.END, content)
        # <KeyRelease> does not fire for programmatic inserts, so refresh the
        # button states explicitly — otherwise "分析全部" stayed disabled
        # after loading a file until the user pressed a key.
        self.on_text_change()

    def clear(self):
        """Empty the input area and reset button/status state."""
        self.text_area.delete(1.0, tk.END)
        self.on_text_change()

    def start_analysis(self):
        """Validate input, dedupe against the last run, and launch the worker."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        if not validate_input(text):
            messagebox.showwarning("输入无效", "请确保输入足够长的英文文本。")
            return
        new_hash = CACHE.key(text)
        if self.current_hash == new_hash:
            self.status.config(text="ℹ️ 内容未变,跳过重复分析", foreground="blue")
            return
        self.current_hash = new_hash
        self.status.config(text="⏳ 分析中...(界面仍可操作)", foreground="blue")
        self.analyze_btn.config(state='disabled')
        threading.Thread(target=self._analyze_bg, args=(text,), daemon=True).start()

    def _analyze_bg(self, text):
        """Worker: run every analyzer (consulting the cache), then update the UI."""
        # Built once per run — the original rebuilt this dict on every loop
        # iteration for no benefit.
        func_map = {
            "basic": basic_stats,
            "freq": word_freq,
            "sentiment": sentiment,
            "chars": characters,
            "relations": relations,
        }
        results = {}
        for a in ANALYZERS:
            cached = CACHE.get(self.current_hash, a["name"])
            if cached is None:
                cached = func_map[a["name"]](text)
                CACHE.set(self.current_hash, a["name"], cached)
            results[a["name"]] = cached
        self.root.after(0, lambda: self._update_ui(results))

    def _update_ui(self, results):
        """Render every analyzer's result into its tab (runs on the UI thread)."""
        for name, data in results.items():
            render_result(self.tabs[name], name, data, self)
        self.status.config(text="🎉 分析完成!切换标签页查看结果", foreground="green")
        self.analyze_btn.config(state='normal')
        self.export_btn.config(state='normal')

    def export_report(self):
        """Write the cached analysis summaries to a Markdown file."""
        raw = self.text_area.get(1.0, tk.END)
        text = clean_markup(raw)
        # Also require a completed analysis: the export button becomes active
        # as soon as the text validates, but before the first analysis there
        # is no hash/cache to export (the original crashed with
        # AttributeError on self.current_hash here).
        if not validate_input(text) or self.current_hash is None:
            messagebox.showwarning("导出失败", "请先输入有效文本并完成分析。")
            return
        path = filedialog.asksaveasfilename(
            defaultextension=".md",
            filetypes=[("Markdown 文件", "*.md"), ("所有文件", "*.*")]
        )
        if not path:
            return
        try:
            sections = [f"# 《乱世佳人》文本分析报告\n\n> 基于 {len(text)} 字符的英文文本\n\n---\n"]
            for a in ANALYZERS:
                data = CACHE.get(self.current_hash, a["name"])
                if data is not None:
                    summary = generate_summary(a["name"], data)
                    # Strip the **bold** markers for plain-text friendliness.
                    summary_clean = summary.replace("**", "")
                    sections.append(f"\n## {a['title']}\n\n{summary_clean}\n")
            with open(path, 'w', encoding='utf-8') as f:
                f.write("".join(sections))
            messagebox.showinfo("导出成功", f"报告已保存至:\n{os.path.abspath(path)}")
        except Exception as e:
            messagebox.showerror("导出失败", f"保存时出错:\n{str(e)}")

    # === Visualization helpers ===
    def show_wordcloud(self, counter):
        """Render *counter* (word -> frequency) as a word cloud via matplotlib."""
        try:
            wc = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(counter)
            plt.figure(figsize=(10, 5))
            plt.imshow(wc, interpolation='bilinear')
            plt.axis("off")
            plt.title("高频词云图")
            plt.show()
        except Exception as e:
            messagebox.showerror("错误", f"词云生成失败:{e}")

    def show_relations(self, pairs):
        """Draw the character co-occurrence network (edges with weight >= 2)."""
        G = nx.Graph()
        for (a, b), w in pairs.items():
            if w >= 2:
                G.add_edge(a.capitalize(), b.capitalize(), weight=w)
        if not G.nodes():
            messagebox.showinfo("提示", "无足够共现数据(需 ≥2 次)")
            return
        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(G, k=3, iterations=100)
        nx.draw_networkx_nodes(G, pos, node_size=1200, node_color='lightcoral')
        nx.draw_networkx_labels(G, pos, font_size=11, font_weight='bold')
        weights = [d['weight'] for u, v, d in G.edges(data=True)]
        # Edge width scales with co-occurrence count.
        nx.draw_networkx_edges(G, pos, width=[w * 1.2 for w in weights], alpha=0.6)
        plt.title("人物共现关系网络(阈值:≥2 次)")
        plt.axis('off')
        plt.tight_layout()
        plt.show()
# === 启动 ===
def _main():
    """Create the themed root window and run the application event loop."""
    root = ThemedTk(theme="azure")
    App(root)
    root.mainloop()


if __name__ == "__main__":
    _main()
请评价一下这段代码。(原帖标注:最新发布)