0 Imports
import requests
import json
import jieba.analyse
import codecs
from time import sleep
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import re
jieba and wordcloud are third-party packages and have to be installed separately (e.g. pip install jieba wordcloud).
1 Crawl first
def get_user_info(uid):
    # Send the request
    result = requests.get('https://m.weibo.cn/api/container/getIndex?type=uid&value={}'.format(uid))
    json_data = result.json()
    # Pull the fields we need out of the JSON response
    userinfo = {
        'name': json_data['data']['userInfo']['screen_name'],  # screen name
        'description': json_data['data']['userInfo']['description'],  # user description
        'follow_count': json_data['data']['userInfo']['follow_count'],  # number of accounts followed
        'followers_count': json_data['data']['userInfo']['followers_count'],  # number of followers
        'profile_image_url': json_data['data']['userInfo']['profile_image_url'],  # avatar URL
        'verified_reason': json_data['data']['userInfo']['verified_reason'],  # verification info
        'containerid': json_data['data']['tabsInfo']['tabs'][1]['containerid']  # needed later to fetch the posts
    }
    # Get gender; Weibo uses 'm' for male and 'f' for female
    if json_data['data']['userInfo']['gender'] == 'm':
        gender = '男'
    elif json_data['data']['userInfo']['gender'] == 'f':
        gender = '女'
    else:
        gender = '未知'
    userinfo['gender'] = gender
    return userinfo
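If the account is not verified, fields like verified_reason may be absent from the response and the lookups above will raise KeyError. A minimal defensive sketch using dict.get (the empty-string defaults are my assumption, not part of the original):
# Hedged variant: tolerate missing keys instead of raising KeyError
user = json_data.get('data', {}).get('userInfo', {})
userinfo = {
    'name': user.get('screen_name', ''),
    'description': user.get('description', ''),
    'verified_reason': user.get('verified_reason', ''),  # may be absent for unverified accounts
}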
While debugging, it helps to pretty-print the response as an indented tree, which makes the nested dict much easier to read (this needs import json and must run where json_data is in scope):
# pretty-print json_data with indentation for inspection
jsonDumpsIndentStr = json.dumps(json_data, indent=1, ensure_ascii=False)
print(jsonDumpsIndentStr)
2 Keep crawling: pagination
def get_all_post(uid, containerid):
    # Start from the first page
    page = 1
    count = 0
    # Collected post texts go here
    posts = []
    while True:
        # Request one page of posts
        result = requests.get('https://m.weibo.cn/api/container/getIndex?type=uid&value={}&containerid={}&page={}'.format(uid, containerid, page))
        json_data = result.json()
        # Stop once there are no more posts
        if not json_data['data']['cards']:
            break
        # Demo limit: only fetch the first few pages
        if page >= 3:
            break
        # Append each new post text to the list
        for i in json_data['data']['cards']:
            try:
                posts.append(i['mblog']['text'])
                count += 1
            except KeyError:
                continue
        # Pause half a second to avoid triggering anti-scraping measures
        sleep(0.5)
        # print('Fetching page {}, {} posts captured so far'.format(page, count))
        # Move on to the next page
        page += 1
    # Return all post texts
    return posts
Page 0 and page 1 return the same content, and some cards have no i['mblog']['text'], hence the except KeyError: continue.
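Since page 0 and page 1 return the same cards, duplicate posts can end up in the list. A small dedup sketch for the inner loop, assuming each mblog dict carries an 'id' field (an assumption about this API, worth verifying):
seen_ids = set()  # ids of posts we have already collected
for i in json_data['data']['cards']:
    mblog = i.get('mblog')
    if mblog is None or mblog.get('id') in seen_ids:
        continue  # no post on this card, or a post seen on an earlier page
    seen_ids.add(mblog.get('id'))
    posts.append(mblog['text'])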
3 Fetch and save the posts
# Get the profile info for 古力娜扎 (Gulnazar)
#userinfo = get_user_info('1350995007')
# The info looks like this
#print(userinfo)
posts = get_all_post('1350995007', '1076031350995007')
# Check the number of posts
# print(len(posts))
# Show the first 3
# print(*posts[:3], sep='-----')
with codecs.open('weibo1.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(posts))
4 Clean the scraped text
def clean_html(raw_html):
    # Strip HTML tags, repost markers, reply prefixes and common punctuation
    pattern = re.compile(r'<.*?>|转发微博|//:|Repost|,|?|。|、|分享图片|回复@.*?:|//@.*')
    text = re.sub(pattern, '', raw_html)
    return text
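A quick sanity check of clean_html on a made-up post (the sample string is invented for illustration):
sample = '转发微博 <a href="/n/xx">@xx</a> 今天天气不错。分享图片//@路人:Repost'
print(clean_html(sample))
# prints ' @xx 今天天气不错' (tags, boilerplate and the trailing repost chain are gone)
Now apply it to the whole scraped file: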
txt = []
with codecs.open("weibo1.txt", 'r', encoding="utf-8") as f:
    for text in f.readlines():
        text = clean_html(text)
        txt.append(text)
# print(txt)
with codecs.open('after_filter_words.txt', 'w', encoding='utf-8') as f:
    f.write(''.join(txt))
The txt list is the intermediary between the two files: each cleaned line (still carrying its trailing newline from readlines) is collected into it and then written back out, so the line breaks are preserved.
5 Generate the word cloud
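This step leans on jieba.analyse.extract_tags, which ranks keywords by TF-IDF. A tiny standalone example first (the sentence is made up; topK and withWeight are real parameters of extract_tags):
import jieba.analyse

sentence = '今天在北京参加了一场关于机器学习的技术分享'
# topK caps the number of keywords; withWeight=True also returns each TF-IDF score
for word, weight in jieba.analyse.extract_tags(sentence, topK=5, withWeight=True):
    print(word, round(weight, 3))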
data = []
with codecs.open("after_filter_words.txt", 'r', encoding="utf-8") as f:
    for text in f.readlines():
        print(text)
        # Extract the top 20 keywords of each line with jieba's TF-IDF extractor
        data.extend(jieba.analyse.extract_tags(text, topK=20))
data = " ".join(data)
print('------------------------------------')
print(data)
# mask_img = imageio.imread('./52f90c9a5131c.jpg', flatten=True)
mask_img = mpimg.imread('./52f90c9a5131c.jpg')
plt.imshow(mask_img)
wordcloud = WordCloud(
    font_path='msyh.ttc',  # a Chinese-capable font is required, or the cloud renders boxes
    background_color='white',
    mask=mask_img
).generate(data)
plt.imshow(wordcloud, interpolation="bilinear")
plt.show()
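To keep the image instead of only displaying it, WordCloud objects have a to_file method (the filename here is my choice):
# Save the rendered cloud as a PNG next to the script
wordcloud.to_file('wordcloud.png')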
6 Errors encountered along the way
- When generating the word cloud: ValueError: We need at least 1 word to plot a word cloud, got 0. The generated after_filter_words.txt had a formatting problem; re-saving it as UTF-8 didn't help, but pasting an arbitrary sentence into the file made it work, so pay attention to the text file's encoding.
- Loading the word cloud mask image raised: image is deprecated! toimage is deprecated in SciPy 1.0.0, and will be removed in 1.2.0. Fix: import matplotlib.image as mpimg and load the image with mpimg.imread().
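As another way to load the mask, the examples in the wordcloud project typically go through PIL and numpy; a minimal sketch reusing the same image file:
import numpy as np
from PIL import Image

# Load the mask as a uint8 array; WordCloud treats white (255) areas as background
mask_img = np.array(Image.open('./52f90c9a5131c.jpg'))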