import requests
from bs4 import BeautifulSoup
import re
import jieba
import matplotlib.pyplot as plt
import networkx as nx
import csv
import matplotlib.colors as mcolors
import os
# Font setup for Chinese labels: select the SimHei font and render minus
# signs with the ASCII hyphen instead of the Unicode minus character.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
def fetch_html(url, timeout=10):
    """Download a page and return its body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The response body as a string. Bytes that are not valid UTF-8 are
        replaced with U+FFFD instead of aborting the whole download.

    Raises:
        Exception: If the server answers with a status code other than 200.
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page, status code: {response.status_code}")
    return response.content.decode('utf-8', errors='replace')
def extract_plot(url):
    """Return the synopsis text scraped from a Douban subject page.

    The text of the ``div#link-report`` element is used as the synopsis, with
    ideographic spaces removed. When that text is shorter than 500 characters
    (including when the element is missing), the text of the first five
    ``div.review-short`` elements is appended, each followed by a blank line.

    Args:
        url: Address of the subject page.

    Returns:
        The collected text; an empty string when nothing was found.
    """
    page = BeautifulSoup(fetch_html(url), 'html.parser')

    synopsis = ""
    report = page.find('div', id="link-report")
    if report:
        raw = report.get_text(strip=True)
        synopsis = raw.replace("\u3000", "").replace("\\n", "\n")

    # Long enough already: no need to pad with review snippets.
    if len(synopsis) >= 500:
        return synopsis

    snippets = page.find_all('div', class_='review-short')[:5]
    padding = "".join(
        snippet.get_text(strip=True).replace("\u3000", "") + "\n\n"
        for snippet in snippets
    )
    return synopsis + padding
def extract_characters(url):
    """Return the character names listed on a Douban cast-and-crew page.

    Every ``span.role`` inside a ``div.list-wrapper`` list item is inspected.
    The character name is taken from the text that follows the "饰" ("plays")
    marker; entries without that marker are skipped.

    NOTE(review): assumes cast roles are rendered like "演员 Actress (饰 芈月)"
    and that crew roles carry no "饰" marker - confirm against the live page.

    Args:
        url: Address of the cast-and-crew page.

    Returns:
        Character names in page order, without duplicates. Names of a single
        character are dropped.
    """
    soup = BeautifulSoup(fetch_html(url), 'html.parser')

    # Capture what follows "饰"/"饰演" up to the next bracket. The marker must
    # open the text or a bracket so that words merely containing "饰" do not
    # match. A character class such as [饰演] must not be used here: it matches
    # the single character "演" and would erase roles starting with "演员".
    played_role = re.compile(r'(?:^|[((\[])\s*饰演?\s*[::]?\s*([^()()\[\]]+)')

    characters = []
    seen = set()
    for section in soup.find_all('div', class_='list-wrapper'):
        for item in section.find_all('li'):
            role_span = item.find('span', class_='role')
            if not role_span:
                continue
            match = played_role.search(role_span.get_text(strip=True))
            if not match:
                # No "饰" marker, so no character name to extract.
                continue
            name = match.group(1).strip()
            if len(name) > 1 and name not in seen:
                seen.add(name)
                characters.append(name)
    return characters
def preprocess_plot(plot_text):
    """Rewrite mentions of the drama title and of the lead's name.

    "芈月传" (the title) becomes "本剧" and any remaining "芈月" becomes "主角".

    Args:
        plot_text: Raw synopsis text.

    Returns:
        The text with both substitutions applied.
    """
    # The longer title must be replaced first. If "芈月" is handled first the
    # title turns into "主角传" and the title rule can never match.
    # NOTE(review): replacing "芈月" removes the lead's name from the text, so
    # code that later counts "芈月" will not find her - confirm this is intended.
    return plot_text.replace("芈月传", "本剧").replace("芈月", "主角")
# Douban URLs for the drama "芈月传".
plot_url = "https://movie.douban.com/subject/25742289/"  # synopsis page
character_url = "https://movie.douban.com/subject/25742289/celebrities"  # cast and crew page

# 1. Fetch the synopsis.
print("获取剧情内容...")
try:
    plot_text = extract_plot(plot_url)
except Exception as exc:
    # fetch_html raises a plain Exception on non-200 answers and requests
    # raises its own errors on connection problems. Either way, continue with
    # the local backup instead of aborting the script.
    print(f"获取剧情失败({exc}),尝试使用本地备份")
    plot_text = ""

backup_path = "芈月传剧情备份.txt"
if plot_text:
    # Fresh download succeeded: refresh the backup for future offline runs.
    with open(backup_path, "w", encoding="utf-8") as f:
        f.write(plot_text)
elif os.path.exists(backup_path):
    # Nothing downloaded: fall back to the text saved by an earlier run.
    with open(backup_path, "r", encoding="utf-8") as f:
        plot_text = f.read()

# Normalise the text and store it in the file read by the counting step.
plot_text = preprocess_plot(plot_text)
with open('芈月传剧情.txt', 'w', encoding='utf-8') as f:
    f.write(plot_text)
# 2. Fetch the character list.
print("获取角色列表...")
try:
    characters = extract_characters(character_url)
except Exception as exc:
    # A failed download must not abort the run; the preset list below is the
    # fallback for exactly this situation.
    print(f"获取角色列表失败({exc}),使用预设角色列表")
    characters = []

if not characters:
    # Preset list used when nothing could be scraped. Every name appears once.
    characters = ['芈月', '芈姝', '嬴驷', '黄歇', '翟骊', '魏琰', '嬴稷', '芈槐',
                  '芈茵', '芈戎', '孟嬴', '香儿', '葵姑', '威后', '莒姬',
                  '穆监', '绿萝', '嬴华', '樗里子', '芈姝母', '芈月母', '张仪',
                  '苏秦', '庸芮', '魏冉']

# Persist the list, one name per line.
with open('芈月传角色名单.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(characters))
print(f"角色列表: {', '.join(characters)}")
print(f"剧情文本长度: {len(plot_text)} 字符")
# 3. Register every character name with jieba's dictionary.
for name in characters:
    jieba.add_word(name)
    # Longer names additionally get some of their substrings registered.
    name_length = len(name)
    if name_length == 3:
        fragments = (name[:2], name[1:])
    elif name_length >= 4:
        fragments = (name[:2], name[2:], name[1:3])
    else:
        fragments = ()
    for fragment in fragments:
        jieba.add_word(fragment)
# 4. Count how often each character is mentioned.
print("统计人物出场次数...")
with open('芈月传剧情.txt', 'r', encoding='utf-8') as f:
    plot_summary = f.read()

# Split into sentences on terminators in both their full-width (Chinese) and
# ASCII forms, then drop fragments of five characters or fewer.
sentences = re.split(r'[。!?;!?;\n]', plot_summary)
sentences = [s for s in sentences if len(s.strip()) > 5]

# Tally tokens that exactly equal a character name.
name_counts = {name: 0 for name in characters}
for sentence in sentences:
    for word in jieba.cut(sentence):
        if word in name_counts:
            name_counts[word] += 1

# Characters that are mentioned at least once.
active_characters = [name for name, count in name_counts.items() if count > 0]
print(f"有效角色数量: {len(active_characters)}")

# Rank by mention count and keep the mentioned ones among the first fifteen.
sorted_name_counts = sorted(name_counts.items(), key=lambda item: item[1], reverse=True)
top_entries = [(name, count) for name, count in sorted_name_counts[:15] if count > 0]
top_characters = [name for name, _ in top_entries]
top_counts = [count for _, count in top_entries]
# Bar chart of the mention counts, saved as a PNG file.
axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
fig = plt.figure(figsize=(14, 8))
plt.bar(top_characters, top_counts, color='#1f77b4', alpha=0.8)
plt.xlabel('角色名称', **axis_label_style)
plt.ylabel('出场次数', **axis_label_style)
plt.title('《芈月传》人物出场次数统计TOP15', fontsize=16, fontweight='bold')
# Tilt the names so neighbouring labels do not overlap.
plt.xticks(rotation=25)
plt.grid(axis='y', alpha=0.5)
fig.tight_layout()
fig.savefig('芈月传人物出场次数统计.png', dpi=150)
plt.close(fig)
# 5. Build the character relationship network.
print("构建人物关系网络...")
G = nx.Graph()
# One node per mentioned character; "size" stores the mention count.
G.add_nodes_from(
    (name, {'size': name_counts[name]}) for name in active_characters
)
# Hand-written relationship scores taken from the plot. The sign separates
# friendly (positive) from hostile (negative) pairs; the magnitude scales the
# edge weight.
enhanced_relationships = {
    ('芈月', '芈姝'): -5,  # rivals
    ('芈月', '黄歇'): 8,  # lovers
    ('芈月', '翟骊'): 7,  # lovers
    ('芈月', '嬴驷'): 6,  # ruler and subject / spouses
    ('芈月', '嬴稷'): 9,  # mother and son
    ('芈姝', '嬴驷'): 6,  # spouses
    ('芈姝', '魏琰'): -3,  # part rival, part friend
    ('黄歇', '翟骊'): -4,  # rivals in love
    ('嬴驷', '魏琰'): 4,  # ruler and subject
    ('芈月', '芈茵'): -5,  # rivals
    ('芈月', '威后'): -6,  # rivals
    ('芈月', '张仪'): 7,  # allies
}
# Count, for every pair of characters, the sentences that mention both.
sentence_cooccurrence = {}
# Set lookup keeps the per-token membership test constant-time.
active_lookup = set(active_characters)
for sentence in sentences:
    # Characters present in this sentence. Sorting makes each pair key
    # (smaller, larger) and fixes the iteration order, which a bare set of
    # strings does not guarantee from one run to the next.
    present = sorted({word for word in jieba.cut(sentence) if word in active_lookup})
    for index, first in enumerate(present):
        for second in present[index + 1:]:
            pair = (first, second)
            sentence_cooccurrence[pair] = sentence_cooccurrence.get(pair, 0) + 1
# Turn the co-occurrence counts into weighted edges.
for pair, count in sentence_cooccurrence.items():
    char1, char2 = pair
    # The preset table is keyed by ordered tuples, so try both orientations.
    preset = enhanced_relationships.get(pair)
    if preset is None:
        preset = enhanced_relationships.get((char2, char1))

    if preset is not None:
        # Preset pair: scale the count (at least 2) by |score| / 5 and carry
        # the sign of the preset score.
        magnitude = max(2, count) * (abs(preset) / 5)
        weight = magnitude * (1 if preset > 0 else -1)
    elif count <= 1:
        # A single shared sentence is too weak to draw an edge.
        continue
    else:
        weight = count

    # Accumulate onto an existing edge, otherwise create it.
    if G.has_edge(char1, char2):
        G[char1][char2]['weight'] = G[char1][char2].get('weight', 0) + weight
    else:
        G.add_edge(char1, char2, weight=weight)
# 6. Export the nodes and edges as CSV files (UTF-8 with BOM).
with open('芈月传人物关系_节点.csv', 'w', newline='', encoding='utf-8-sig') as f:
    node_writer = csv.writer(f)
    node_writer.writerow(['Id', 'Label', 'Size'])
    node_writer.writerows(
        [node, node, attrs.get('size', 1)] for node, attrs in G.nodes(data=True)
    )
with open('芈月传人物关系_边.csv', 'w', newline='', encoding='utf-8-sig') as f:
    edge_writer = csv.writer(f)
    edge_writer.writerow(['Source', 'Target', 'Weight', 'Type'])
    for source, target, attrs in G.edges(data=True):
        signed_weight = attrs['weight']
        # The file stores the magnitude; the sign goes into the Type column.
        relation = 'Negative' if signed_weight <= 0 else 'Positive'
        edge_writer.writerow([source, target, abs(signed_weight), relation])
# 7. Draw the relationship network (skipped when the graph has no edges).
if G.number_of_edges() > 0:
    plt.figure(figsize=(20, 16))

    # Node area follows the mention count.
    node_sizes = [attrs.get('size', 1) * 100 for _, attrs in G.nodes(data=True)]

    # Edge colour encodes the sign of the weight, edge width its magnitude
    # capped at 5.
    signed_weights = [attrs['weight'] for _, _, attrs in G.edges(data=True)]
    edge_colors = ['red' if w > 0 else 'blue' for w in signed_weights]
    edge_widths = [min(abs(w), 5) for w in signed_weights]

    # Spring layout with a fixed seed.
    pos = nx.spring_layout(G, k=0.3, iterations=50, seed=42)

    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='#ffcc99', alpha=0.9)
    nx.draw_networkx_edges(G, pos, edge_color=edge_colors, width=edge_widths, alpha=0.7)
    nx.draw_networkx_labels(G, pos, font_size=12, font_family='SimHei')

    # Empty artists that exist only to populate the legend.
    plt.scatter([], [], c='#ffcc99', s=100, label='人物节点')
    plt.plot([], [], c='red', linewidth=3, label='正向关系')
    plt.plot([], [], c='blue', linewidth=3, label='负向关系')
    plt.legend(loc='best', fontsize=12)

    plt.title('《芈月传》人物关系网络', fontsize=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig('芈月传人物关系图.png', dpi=150)
    plt.close()
print("分析完成!已保存关系图和统计数据文件。")
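# TODO: Using the code above as a template, scrape the episode data from TVMao (https://www.tvmao.com/drama/KSExaik=/episode) and build a character relationship graph from it.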