import requests
import pandas as pd
import re
import jieba
import jieba.posseg as pseg
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import numpy as np
# ======================
# 1. Data loading and noise cleaning
# ======================
# Weibo hot search (hot band) API
url = "https://weibo.com/ajax/statuses/hot_band"
cookies = {
"SCF": "Atd7tjiioKSLksVw7XSNkxi5F5cyOjh_kYA6R4iPrUaG4t9WLxx0cFOvRahXLbjQQZP6tNqrHjWe0uZO_NC2HG8.",
"SUB": "_2A25FR7uEDeRhGeFH6lUY8yvEwziIHXVmPLFMrDV8PUNbmtAbLXjHkW9Ne9sVtBXy4xW_ny2_cj73Iknd8X1Xv9zi",
"SUBP": "0033WrSXqPxfM725Ws9jqgMF55529P9D9WW4ATZM98nvQ774WfWG8RJ.5NHD95QN1K2N1Kef1hnXWs4DqcjMi--NiK.Xi-2Ri--ciKnRi-zNS0.pS0.0SKnRS5tt",
"ALF": "02_1751865557",
"XSRF-TOKEN": "s8dQ_T9a9JEpVhleuHZEAUGn",
"PC_TOKEN": "2714683c19",
"WBPSESS": "thILuXQ1w8FjHm2phr8ykyNoiO95wc6N6vU8knN1G5E8oQ1KGmrNvK-0MeRsdJJ_3ApByqSdOfDGeRNuXzxQBddj0eKTs6PmX1-T3zzaKO9-gRD5-dvW4mz_6CYt5Kf9Yfjnj5Ttet5Laz_PmXyDzQ=="
}
# Fetch the Weibo hot search list
def fetch_weibo_hot():
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "X-XSRF-TOKEN": cookies["XSRF-TOKEN"]
    }
    response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
    response.raise_for_status()
    data = response.json()
    hot_list = []
    # Handle possible changes in the API response structure
    for i, item in enumerate(data['data']['band_list'][:10], start=1):
        # Try several possible field names for the heat value (the field has changed over time)
        hot_value = item.get('raw_hot', item.get('num', item.get('hot_value', 0)))
        # Get the topic name
        topic = item.get('note', item.get('word', '未知话题'))
        hot_list.append({
            "rank": item.get('rank', i),  # fall back to the list position if 'rank' is missing
            "topic": topic,
            "hot": hot_value
        })
    return pd.DataFrame(hot_list)
# Text cleaning function
def clean_text(text):
    # Return an empty string for NaN or any other non-string value
    if not isinstance(text, str):
        return ""
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove @usernames
    text = re.sub(r'@\w+', '', text)
    # Remove #topic# hashtags
    text = re.sub(r'#\w+#', '', text)
    # Remove bracketed emoji codes
    text = re.sub(r'\[.*?\]', '', text)
    # Remove special symbols
    text = re.sub(r'[【】/*…→↓↑←★☆《》<>()(){}「」]', '', text)
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Main workflow
print("正在获取微博热榜数据...")
try:
    # Fetch the data and save it to CSV
    df = fetch_weibo_hot()
    df.to_csv("weibo_hot_top10.csv", index=False, encoding='utf-8-sig')
    print("微博热榜数据已保存到 weibo_hot_top10.csv")
    # Read the CSV back in
    df = pd.read_csv("weibo_hot_top10.csv")
    # Clean the topic text
    print("正在清洗文本数据...")
    df['cleaned_topic'] = df['topic'].apply(clean_text)
    print(f"清洗后示例:\n{df['cleaned_topic'].head()}")
except Exception as e:
    print(f"数据加载失败: {e}")
    # If the API call fails, fall back to sample data so the rest of the script can still run
    print("使用示例数据继续运行...")
    data = {
        'rank': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        'topic': [
            '全球气候变化峰会召开',
            '科技巨头发布新一代AI产品',
            '国际体育赛事圆满结束',
            '新电影票房破纪录',
            '健康饮食新趋势',
            '教育政策改革引发讨论',
            '旅游热门目的地推荐',
            '数字经济创新发展',
            '环保行动倡议',
            '文化传承与创新'
        ],
        'hot': [5000000, 4500000, 4000000, 3800000, 3500000, 3300000, 3000000, 2800000, 2500000, 2200000]
    }
    df = pd.DataFrame(data)
    df['cleaned_topic'] = df['topic'].apply(clean_text)
# ============================
# 2. Word segmentation and stop-word filtering
# ============================
# Load the stop-word list
try:
    with open("stopwords.txt", "r", encoding="utf-8") as f:
        stopwords = {line.strip() for line in f if line.strip()}
    print("停用词表加载成功")
except FileNotFoundError:
    print("未找到stopwords.txt,使用内置停用词")
    # Minimal built-in stop-word list
    stopwords = set(
        "的 了 在 是 我 你 他 她 它 我们 你们 他们 啊 呀 吧 呢 吗 和 与 或 有 就 都 而 及 以及 等 等等".split())
# Extra domain-specific stop words
additional_stopwords = {"微博", "热搜", "话题", "新闻", "今天", "每日", "热门", "查看", "搜索", "点击"}
stopwords |= additional_stopwords
# Initialize the jieba tokenizer
jieba.initialize()
# Segmentation function: drop stop words and single-character tokens
def segment_text(text):
    words = jieba.lcut(text)
    return [word for word in words if word not in stopwords and len(word) > 1]
# Apply segmentation to every cleaned topic
print("正在进行中文分词...")
df['segmented'] = df['cleaned_topic'].apply(segment_text)
print(f"分词结果示例:\n{df['segmented'].head()}")
# ==================================
# 3. Word-frequency and part-of-speech analysis
# ==================================
# Flatten all segmented word lists into one list
all_words = [word for sublist in df['segmented'].tolist() for word in sublist]
if not all_words:
    print("分词结果为空,添加示例词汇")
    all_words = ['科技', '发展', '创新', '文化', '环保', '健康', '教育', '旅游', '经济', '体育']
word_freq = Counter(all_words)
# Get the top 10 most frequent words
top_words = word_freq.most_common(10)
top_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])
# Plot a horizontal bar chart of the top words
plt.figure(figsize=(12, 6))
sns.barplot(x='Frequency', y='Word', data=top_df, palette="viridis")
plt.title('微博热榜Top10高频词汇', fontsize=16)
plt.xlabel('出现频率', fontsize=12)
plt.ylabel('关键词', fontsize=12)
plt.tight_layout()
plt.savefig('top10_words.png', dpi=300)
print("高频词汇柱状图已保存为 top10_words.png")
plt.show()
# Part-of-speech tagging function
def pos_tagging(words):
    # Re-run pseg over the joined words to obtain (word, flag) pairs, filtering stop words
    return [(word, flag) for word, flag in pseg.cut("".join(words)) if word not in stopwords]
# Part-of-speech tagging
print("正在进行词性标注...")
df['pos_tags'] = df['segmented'].apply(pos_tagging)
all_pos = [flag for sublist in df['pos_tags'].tolist() for _, flag in sublist]
if not all_pos:
    print("词性标注为空,添加示例数据")
    all_pos = ['n', 'n', 'v', 'n', 'a', 'n', 'n', 'n', 'n', 'n']
pos_freq = Counter(all_pos)
# Human-readable names for common POS tags
pos_names = {
    'n': '名词', 'v': '动词', 'a': '形容词', 'd': '副词',
    'm': '数词', 'q': '量词', 'r': '代词', 'p': '介词',
    'c': '连词', 'u': '助词', 'xc': '其他虚词', 'w': '标点符号'
}
# =================
# 4. Generate the word cloud
# =================
# Build the word-cloud input text
text_for_wordcloud = " ".join(all_words)
if not text_for_wordcloud.strip():
    text_for_wordcloud = "科技 发展 创新 文化 环保 健康 教育 旅游 经济 体育"
print("正在生成词云图...")
# Create the WordCloud object
wc = WordCloud(
    font_path='simhei.ttf',  # SimHei; make sure this font file exists on the system
    background_color='white',
    width=1000,
    height=700,
    max_words=150,
    colormap='viridis',
    contour_width=2,
    contour_color='steelblue',
    scale=2  # increase output resolution
)
# Generate the word cloud from the text
wc.generate(text_for_wordcloud)
# Render and save the word cloud
plt.figure(figsize=(14, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.title('微博热榜关键词词云', fontsize=18, pad=20)
plt.savefig('wordcloud.png', bbox_inches='tight', dpi=300)
print("词云图已保存为 wordcloud.png")
plt.show()
# =================
# 5. Additional analysis
# =================
# Distribution of the most frequent POS tags
plt.figure(figsize=(12, 7))
top_pos = pos_freq.most_common(8)
pos_data = []
for tag, count in top_pos:
    name = pos_names.get(tag, f'其他({tag})')
    pos_data.append((name, count))
pos_df = pd.DataFrame(pos_data, columns=['词性', '数量'])
sns.barplot(x='数量', y='词性', data=pos_df, palette="rocket")
plt.title('高频词性分布', fontsize=16)
plt.xlabel('数量', fontsize=12)
plt.ylabel('词性', fontsize=12)
plt.tight_layout()
plt.savefig('pos_distribution.png', dpi=300)
print("词性分布图已保存为 pos_distribution.png")
plt.show()
# Counts of nouns, verbs and adjectives
nouns = sum(1 for tag in all_pos if tag == 'n')
adjectives = sum(1 for tag in all_pos if tag == 'a')
verbs = sum(1 for tag in all_pos if tag == 'v')
total_tags = len(all_pos) if len(all_pos) > 0 else 1  # avoid division by zero
print("\n=== 词性分析结果 ===")
print(f"名词占比: {nouns / total_tags:.2%} ({nouns}个)")
print(f"动词占比: {verbs / total_tags:.2%} ({verbs}个)")
print(f"形容词占比: {adjectives / total_tags:.2%} ({adjectives}个)")
print(f"情感类形容词数量: {adjectives}个")
# Save the processed data
df.to_csv("processed_weibo_data.csv", index=False, encoding='utf-8-sig')
print("\n数据处理完成! 结果已保存到 processed_weibo_data.csv")
Two problems with the code above. Problem 1: in the bar charts, the Chinese labels do not render; only empty boxes appear. A hedged fix is sketched just below.
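A minimal sketch for problem 1, assuming a Windows machine with the SimHei font installed (on macOS or Linux substitute any locally installed CJK font such as 'Microsoft YaHei', 'PingFang SC' or 'Noto Sans CJK SC'). Matplotlib's default fonts contain no Chinese glyphs, so CJK text falls back to the missing-glyph box; registering a Chinese-capable font globally before any plotting call restores the labels.

# Problem 1 sketch: register a CJK-capable font before any plotting code runs.
# 'SimHei' is an assumption; replace it with a Chinese font actually installed on this machine.
import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['SimHei']  # font used for all text; must contain Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign from rendering as a box

Placing these two lines right after the imports at the top of the script is enough; none of the plotting code below needs to change.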
Problem 2: the word cloud generated from the high-frequency words should be shaped like an actual cloud, built up layer by layer, with alternating empty and word-filled rings. Everything else should stay unchanged, and the request is for the complete code. One way to get the layered cloud shape is sketched below.
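For problem 2, a sketch under the assumption that no cloud-shaped image file is available, so the layered shape is drawn programmatically. WordCloud accepts a mask array in which pure white (255) pixels are left empty and darker pixels receive words, so alternating dark and white elliptical rings give the "one layer with text, one layer empty" effect; the ring geometry below is purely illustrative. If a real cloud picture is at hand, loading it with PIL (mask = np.array(Image.open('cloud.png'))) would replace the generated mask.

# Problem 2 sketch: a layered, cloud-shaped mask built with NumPy.
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def make_layered_cloud_mask(width=1000, height=700, rings=4):
    # Build concentric elliptical rings: dark rings (0) take words, white rings (255) stay empty
    y, x = np.ogrid[:height, :width]
    cx, cy = width / 2, height / 2
    # Normalised elliptical distance from the centre (roughly 1.0 at the cloud's outer edge)
    dist = np.sqrt(((x - cx) / (width * 0.48)) ** 2 + ((y - cy) / (height * 0.42)) ** 2)
    mask = np.full((height, width), 255, dtype=np.uint8)  # start fully white, i.e. fully masked out
    step = 1.0 / (2 * rings)
    for i in range(rings):
        inner, outer = 2 * i * step, (2 * i + 1) * step   # every other ring becomes drawable
        mask[(dist >= inner) & (dist < outer)] = 0
    return mask

cloud_mask = make_layered_cloud_mask()
wc_shaped = WordCloud(
    font_path='simhei.ttf',   # assumption: a CJK font file reachable at this path
    background_color='white',
    mask=cloud_mask,          # width/height are taken from the mask
    max_words=150,
    colormap='viridis',
    contour_width=2,
    contour_color='steelblue'
)
wc_shaped.generate(text_for_wordcloud)  # reuses the text built in step 4 above
plt.figure(figsize=(14, 10))
plt.imshow(wc_shaped, interpolation='bilinear')
plt.axis("off")
plt.title('微博热榜关键词词云(云朵分层形状)', fontsize=18, pad=20)
plt.savefig('wordcloud_cloud_shape.png', bbox_inches='tight', dpi=300)
plt.show()

The number of rings and the 0.48/0.42 ellipse factors are arbitrary tuning knobs; adjusting them changes how many text layers appear and how round the overall shape looks.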