import urllib.request
from bs4 import BeautifulSoup
import re
import jieba
import matplotlib.pyplot as plt
import networkx as nx
import csv
import jieba.posseg as pseg
import matplotlib.colors as mcolors
添加用户代理头避免被拦截
headers = {
‘User-Agent’: ‘Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36’
}
从百度百科获取《琅琊榜》数据
url = “https://baike.baidu.com/item/琅瓊榜/15474822”
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
html_content = response.read().decode(‘utf-8’)
soup = BeautifulSoup(html_content, ‘html.parser’)
提取剧情内容
all_jq = “”
主要剧情
main_plot = soup.find(‘div’, class_=‘lemma-summary’)
if main_plot:
paragraphs = main_plot.find_all(‘div’, class_=‘para’)
for para in paragraphs:
content_text = para.get_text(strip=True)
content_text = re.sub(r’\s*[\d+]', ‘’, content_text).strip()
all_jq += content_text + “\n”
分集剧情
episode_section = None
for h2 in soup.find_all(‘h2’):
if “分集剧情” in h2.get_text().strip():
episode_section = h2
break
if episode_section:
table = episode_section.find_next(‘table’, class_=‘table-view’)
if table:
rows = table.find_all(‘tr’)
for row in rows:
if row.th: # 跳过表头
continue
tds = row.find_all(‘td’)
if len(tds) >= 2:
episode_text = tds[1].get_text(strip=True)
清理文本
episode_text = re.sub(r’[编辑]|\s{2,}', ‘’, episode_text)
if episode_text:
all_jq += episode_text + “\n”
保存剧情
with open(‘琅琊榜剧情.txt’, ‘w’, encoding=‘utf-8’) as f:
f.write(all_jq)
提取主要角色 names = [] characters_section = None for h2 in soup.find_all(‘h2’): if “角色介绍” in h2.get_text().strip() or “主要角色” in h2.get_text().strip(): characters_section = h2 break
if characters_section:
查找角色表格
table = characters_section.find_next(‘table’, class_=‘table-view’)
if table:
rows = table.find_all(‘tr’)
for row in rows:
第一行为表头,跳过
if row.th:
continue
第一列为角色名称
first_td = row.find(‘td’)
if first_td:
移除括号内的演员名(如"梅长苏(胡歌饰)")
name_text = re.sub(r’(.)|(.)‘, ‘’, first_td.get_text(strip=True))
只保留中文名
if re.match(r’^[\u4e00-\u9fa5·]{2,4}$', name_text):
names.append(name_text)
手动添加主要角色(确保核心角色都在列表中)
core_characters = [‘梅长苏’, ‘萧景琰’, ‘霓凰郡主’, ‘蒙挚’, ‘誉王’, ‘谢玉’, ‘言豫津’, ‘萧景睿’]
for char in core_characters:
if char not in names:
names.append(char)
去重并排序
names = sorted(set(names))
print(f"提取的角色列表: {', '.join(names)}")
保存角色列表
with open(‘琅琊榜角色名单.txt’, ‘w’, encoding=‘utf-8’) as f:
f.write(‘\n’.join(names))
为分词添加角色名 for name in names: jieba.add_word(name)
对于3-4字的角色名,添加可能的分词组合
if len(name) == 3:
jieba.add_word(name[:2])
jieba.add_word(name[1:])
elif len(name) == 4:
jieba.add_word(name[:2])
jieba.add_word(name[2:])
jieba.add_word(name[1:3])
人物出场次数统计 with open(‘琅琊榜剧情.txt’, ‘r’, encoding=‘utf-8’) as f: plot_summary = f.read()
分割句子
sentences = [s for s in re.split(r’[。!?:]', plot_summary) if s.strip()]
统计各角色出场次数
name_counts = {name: 0 for name in names}
for sentence in sentences:
使用jieba分词
words = jieba.cut(sentence)
for word in words:
if word in name_counts:
name_counts[word] += 1
按出场次数排序
sorted_name_counts = sorted(name_counts.items(), key=lambda item: item[1], reverse=True)
top_ten = sorted_name_counts[:10]
绘制统计图
plt.figure(figsize=(12, 6))
top_names, top_counts = zip(*top_ten)
plt.bar(top_names, top_counts, color=‘#1f77b4’)
plt.xlabel(‘角色名称’, fontsize=12)
plt.ylabel(‘出场次数’, fontsize=12)
plt.title(‘《琅琊榜》主要角色出场统计TOP10’, fontsize=14)
plt.xticks(rotation=15)
plt.tight_layout()
plt.savefig(‘琅琊榜人物出场次数统计.png’, dpi=150)
plt.show()
构建人物关系网络 G = nx.Graph()
添加节点
for name, count in name_counts.items():
if count > 0: # 只添加实际出场的角色
G.add_node(name, size=count)
关系增强字典(已知的强关系对)
enhanced_relationships = {
正面关系
(‘梅长苏’, ‘萧景琰’): 10, # 挚友
(‘梅长苏’, ‘霓凰郡主’): 10, # 情侣
(‘梅长苏’, ‘蒙挚’): 8, # 部下
(‘萧景琰’, ‘霓凰郡主’): 7, # 兄妹
负面关系
(‘梅长苏’, ‘誉王’): -8, # 政敌
(‘萧景琰’, ‘誉王’): -8, # 政敌
(‘梅长苏’, ‘谢玉’): -10, # 仇敌
}
分析人物关系:同一句子中出现算一次关联
sentence_cooccurrence = {}
for sentence in sentences:
找出当前句子中出现的角色
current_chars = set()
words = jieba.cut(sentence)
for word in words:
if word in names:
current_chars.add(word)
为当前句子中出现的人物建立关系
char_list = list(current_chars)
for i in range(len(char_list)):
for j in range(i+1, len(char_list)):
char1 = char_list[i]
char2 = char_list[j]
pair = tuple(sorted([char1, char2]))
# 累加关系权重
if pair in sentence_cooccurrence:
sentence_cooccurrence[pair] += 1
else:
sentence_cooccurrence[pair] = 1
将关系添加到图中
for pair, count in sentence_cooccurrence.items():
weight = count
检查是否在增强字典中
if pair in enhanced_relationships:
如果是正权重就增强,负权重就减弱
sign = 1 if enhanced_relationships[pair] > 0 else -1
weight = max(2, count) * (abs(enhanced_relationships[pair])/5)
weight *= sign
elif (pair[1], pair[0]) in enhanced_relationships:
sign = 1 if enhanced_relationships[(pair[1], pair[0])] > 0 else -1
weight = max(2, count) * (abs(enhanced_relationships[(pair[1], pair[0])])/5)
weight *= sign
G.add_edge(pair[0], pair[1], weight=weight)
7. 导出CSV文件(用于Gephi或其他可视化工具)
with open(‘琅琊榜人物关系_节点.csv’, ‘w’, newline=‘’, encoding=‘utf-8-sig’) as f:
writer = csv.writer(f)
writer.writerow([‘Id’, ‘Label’, ‘Size’])
for node, data in G.nodes(data=True):
writer.writerow([node, node, data.get(‘size’, 1)])
with open(‘琅琊榜人物关系_边.csv’, ‘w’, newline=‘’, encoding=‘utf-8-sig’) as f:
writer = csv.writer(f)
writer.writerow([‘Source’, ‘Target’, ‘Weight’, ‘Type’])
for u, v, data in G.edges(data=True):
edge_type = ‘Positive’ if data[‘weight’] > 0 else ‘Negative’
writer.writerow([u, v, abs(data[‘weight’]), edge_type])
可视化人物关系图 plt.figure(figsize=(20, 16))
创建颜色映射 - 基于关系权重
weights = [abs(data[‘weight’]) for _, _, data in G.edges(data=True)]
norm = mcolors.Normalize(vmin=min(weights), vmax=max(weights))
cmap = plt.cm.coolwarm # 暖色为正向关系,冷色为负向关系
节点大小基于出场次数
node_sizes = [data[‘size’]*50 for _, data in G.nodes(data=True)]
边的颜色和宽度基于关系强度
edge_colors = []
edge_widths = []
for u, v, data in G.edges(data=True):
if data[‘weight’] > 0:
正向关系 - 红色系
edge_colors.append(cmap(1 - norm(data[‘weight’]))) # 浅红到深红
else:
负向关系 - 蓝色系
edge_colors.append(cmap(norm(abs(data[‘weight’])))) # 浅蓝到深蓝
edge_widths.append(norm(abs(data[‘weight’])) * 2 + 0.5)
使用spring_layout算法布局
pos = nx.spring_layout(G, k=0.3, iterations=50, seed=42)
绘制图形
nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=‘#ff9999’, alpha=0.9)
nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=edge_widths, alpha=0.7)
nx.draw_networkx_labels(G, pos, font_size=12, font_family=‘SimHei’)
添加图例(说明)
plt.scatter([], [], c=‘#ff9999’, s=100, label=‘人物节点’)
plt.plot([], [], c=‘red’, linewidth=2, label=‘正向关系’)
plt.plot([], [], c=‘blue’, linewidth=2, label=‘负向关系’)
plt.legend(loc=‘best’)
plt.title(‘《琅琊榜》主要人物关系网络’, fontsize=20)
plt.axis(‘off’)
plt.tight_layout()
plt.savefig(‘琅琊榜人物关系图.png’, dpi=150)
plt.show()
print(“分析完成!已保存关系图和数据文件。”)
将这个案例改变成网址https://baike.baidu.com/item/琅琊榜/12700172 使得生成的图片有数据