爬虫系列之新浪微博

最新推荐文章于 2024-04-16 15:26:40 发布

原创最新推荐文章于 2024-04-16 15:26:40 发布 · 756 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#python #新浪微博 #爬虫 #pyecharts

python学习同时被 2 个专栏收录

16 篇文章

订阅专栏

爬虫

3 篇文章

订阅专栏

提供代码运行过程的相关咨询服务，确保代码成功运行。

本文代码已同步更新到下面的公众号。请大家关注公众号【Python人生之技术实践】，近期将推出豆瓣、知乎、微博、网易云音乐、拉勾网、知网的爬虫教程，代码已经完成，目前处于整理阶段，谢谢。

爬虫方式：

1 爬取某个话题的所有的评论

2 爬取某个博主的所有微博的评论

3 爬取某个博主的所有粉丝的用户信息，位置信息

微博评论爬虫难点：

获取max_id 和 max_id_type 参数

以“迪丽热巴的微博”为案例，爬取热巴的所有微博，所有微博评论，所有粉丝的用户信息，并对所有的粉丝进行位置分析，在地图上显示粉丝分布。

1 requests_weibo.py功能：爬取迪丽热巴的所有的微博，微博内容的text保存在excel中，包括微博id

2 requests_weibo_comment.py 功能：读取热巴所有的微博id，爬取某个微博的所有评论内容，粉丝id

3 user_infro.py 功能：读取所有粉丝id, 爬取微博用户（热巴粉丝）的基本信息（生日，位置，性别）。粉丝信息接口在代码中。

4 map_weibo_fensi.py: 对粉丝的性别和位置信息进行可视化

注意事项：

获取新浪微博评论内容时，第1页、第2页至第15页、第16页及以后页码的url参数是不同的。

第1页：https://m.weibo.cn/comments/hotflow?id=4511519801087016&mid=4511519801087016

第2 -15 页：max_id_type 的数值为 0

https://m.weibo.cn/comments/hotflow?id=4511519801087016&mid=4511519801087016&max_id=1290444946018013&max_id_type=0

第16页及以后：

从第16页开始max_id_type 的数值为 1，这里最好是通过请求返回的结果获取该值，不要手动修改为1。

requests_weibo.py

# time  : 2020/6/1 11:37
# author :wujiajia
# email :wujiajia666@qq.com
# file  : requests_weibo.py
# Software: PyCharm
# python_version:  3.6
# funcation:  获得迪丽热巴的所有微博及粉丝评论的详细url
import requests
import time
# ************************
# Parallel column lists filled by spider(); entries at the same index belong to the same post.
time_ = []  # publication time of each post ("created_at")
comment_url = []  # URL of each post's detail page ("scheme")
text = []  # text content of each post
comment_num = []  # comment count of each post
zan_num = []  # like count of each post ("attitudes_count")
# NOTE(review): this name shadows the builtin `id` for the whole module.
id = []  # unique id of each post; needed to build the comment URL, e.g. https://m.weibo.cn/comments/hotflow?id=4510598555019129&mid=4510598555019129&max_id_type=0

def spider(a):  # a: page number of the blogger's timeline to fetch
    """Fetch one timeline page and append its posts to the module-level lists.

    Only cards whose ``card_type`` is 9 are recorded. For each of them the
    creation time, text, id, detail URL, like count and comment count are
    appended to ``time_``, ``text``, ``id``, ``comment_url``, ``zan_num`` and
    ``comment_num`` respectively.

    Args:
        a: Page number, sent as the ``page`` query parameter.

    Raises:
        KeyError: If a type-9 card lacks one of the expected fields. Nothing
            is appended for that card, so the lists stay aligned.
    """
    url = "https://m.weibo.cn/api/container/getIndex"  # "m." is the mobile site
    # The request has to look like it comes from a phone or tablet browser.
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'}
    data = {  # query parameters as seen in the browser's XHR request
        "uid": "1669879400",
        "t": "0",
        "luicode": "10000011",
        "lfid": "100103type=1&q=迪丽热巴",
        "type": "uid",
        "value": "1669879400",
        "containerid": "1076031669879400",
        "page": str(a)
    }
    # Response.json() forwards keyword arguments to json.loads(), which no
    # longer accepts ``encoding`` on Python 3.9+, so none is passed. The
    # timeout keeps a stalled connection from blocking the crawl forever.
    data_html = requests.get(url=url, params=data, headers=headers, timeout=30).json()
    cards = (data_html.get("data") or {}).get("cards") or []
    # Start at the first card: the card_type check below already filters
    # non-post cards, so skipping index 0 would only lose a post per page.
    for card in cards:
        if card.get("card_type") != 9:
            continue
        mblog = card["mblog"]
        comments_count = mblog["comments_count"]
        if comments_count == "100万+":  # the API caps the displayed count; store it as a number
            comments_count = 1000000
        # Read every field before appending anything so a missing key cannot
        # leave the parallel lists with different lengths.
        row = (
            mblog["created_at"],
            mblog["text"],
            mblog["id"],
            card["scheme"],
            mblog["attitudes_count"],
            comments_count,
        )
        time_.append(row[0])
        text.append(row[1])
        id.append(row[2])
        comment_url.append(row[3])
        zan_num.append(row[4])
        comment_num.append(row[5])

# **************调用spider，爬取多个页面
def run_spider(first_page=0, last_page=144, delay=5):
    """Crawl consecutive timeline pages by calling spider() once per page.

    Args:
        first_page: First page number passed to spider().
        last_page: Page number at which to stop (exclusive). The default of
            144 is the page count used by the original script.
        delay: Seconds to sleep before each page request.
    """
    for i in range(first_page, last_page):
        print("--正在抓取第【{}】页--".format(i))
        time.sleep(delay)  # throttle requests between pages
        spider(i)

# print(list(zip(time_,text,comment_url)))
# ***************保存数据
def save_data():
    """Combine the collected post columns into a table, print it and write it to Excel."""
    rows = list(zip(id, time_, text, comment_url, zan_num, comment_num))
    import pandas as pd
    header = ["id","time","text","comment_url","zan_num","comments_num"]
    table = pd.DataFrame(rows, columns=header)
    print(table)
    table.to_excel("迪丽热巴所有的微博.xlsx")

if __name__ == "__main__":
    # Crawl the timeline pages first, then write everything collected to Excel.
    run_spider()
    save_data()

requests_weibo_comment.py

# time  : 2020/6/3 9:28
# author :wujiajia
# email :wujiajia666@qq.com
# file  : requests_weibo_comment.py
# Software: PyCharm
# python_version: 3.6
# funcation: 读取粉丝评论url，获得粉丝评论及粉丝信息

import pandas as pd
import requests
import  time
# Load the workbook of posts; its "id" column supplies the post ids to crawl.
df = pd.read_excel("迪丽热巴所有的微博.xlsx")
comments_id = df["id"]
# Parallel column lists filled by parse_html(); entries at the same index belong to the same comment.
time_ = []  # creation time of each comment
text = []  # text content of each comment
user_profile_url = []  # profile URL of each commenter
user_name = []  # screen name of each commenter
user_id = [] # unique id of each commenter

# *****************************************spider_weibo_comments() 函数传入某个微博的id，即可以获得该微博的所有评论。
def spider_weibo_comments(weibo_id, total_page=20, delay=5, max_retries=3):
    """Crawl the comment pages of one post and hand each page to parse_html().

    The first request carries only ``id`` and ``mid``. Every response reports
    the ``max_id`` / ``max_id_type`` cursor of the following page, and both
    values are sent back unchanged on the next request, so a change of
    ``max_id_type`` on later pages is picked up from the server rather than
    set by hand.

    Args:
        weibo_id: Id of the post whose comments are crawled.
        total_page: Maximum number of pages to fetch (previously a
            hard-coded 20).
        delay: Seconds to sleep before every request.
        max_retries: Number of consecutive failed requests after which the
            crawl of this post is abandoned, instead of retrying forever.
    """
    url = "https://m.weibo.cn/comments/hotflow"
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Mobile Safari/537.36",
        # requests' ``cookies=`` argument maps cookie names to values, so a raw
        # browser cookie string passed as {"cookie": ...} would be sent as one
        # cookie literally named "cookie". Send it as the Cookie header instead.
        "Cookie": "复制浏览器的cookie信息到这里",
    }
    params = {
        "id": str(weibo_id),  # id of the post
        "mid": str(weibo_id),
    }
    count = 0  # number of pages fetched so far
    failures = 0  # consecutive failed attempts for the current page
    while count < total_page:
        time.sleep(delay)  # throttle requests
        try:
            data_html = requests.get(url=url, params=params, headers=headers, timeout=30).json()
            max_id = data_html["data"]["max_id"]  # cursor of the next page
            max_id_type = data_html["data"]["max_id_type"]
            parse_html(data_html)
        except Exception as e:
            # Best-effort crawl: report the problem and retry the same page,
            # but only a bounded number of times.
            failures += 1
            print(str(count) + "遇到异常", e)
            if failures >= max_retries:
                print("连续失败{}次，停止抓取微博{}的评论".format(failures, weibo_id))
                break
            continue
        failures = 0
        count += 1
        print("正在抓取第【{}】页".format(count))
        if not max_id:
            # NOTE(review): a zero/empty cursor presumably marks the last page
            # (confirm against the API); requesting with it would not advance.
            break
        params["max_id"] = str(max_id)
        params["max_id_type"] = str(max_id_type)

# *****************************解析网页，获取对应的字段
def parse_html(data_html):
    """Append every comment of one response page to the module-level lists.

    Reads the comment list at ``data_html["data"]["data"]`` and records, per
    comment, the creation time, the text, and the author's profile URL,
    screen name and id.
    """
    for comment in data_html["data"]["data"]:
        time_.append(comment["created_at"])
        text.append(comment["text"])
        # Profile page of the commenter; the article uses it to look up the fan's region.
        user_profile_url.append(comment["user"]["profile_url"])
        user_name.append(comment["user"]["screen_name"])
        user_id.append(comment["user"]["id"])  # unique id of the commenter

# *********************调用评论爬虫函数
# comments_id 保存了迪丽热巴的所有微博编号，遍历所有的编号，并保存在excel中。
def run_spider_weiboComments(limit=4):
    """Crawl the comments of the first ``limit`` posts listed in ``comments_id``.

    Args:
        limit: Number of leading post ids to process. The default of 4 matches
            the original hard-coded slice; pass ``None`` to process every id.
    """
    for i in comments_id[0:limit]:
        spider_weibo_comments(i)
# spider_weibo_comments(comments_id[3])

# ***********************保存数据
def save():
    """Write the collected comment columns to an Excel workbook and report completion."""
    header = ["userid","username","time","text","user_profile_url"]
    records = list(zip(user_id, user_name, time_, text, user_profile_url))
    table = pd.DataFrame(records, columns=header)
    table.to_excel("id为4511519801087016微博的所有评论.xlsx")
    print("结束任务")

if __name__ == "__main__":
    # Crawl the comments first, then write everything collected to Excel.
    run_spider_weiboComments()
    save()

user_infro.py

# time  : 2020/6/5 23:35
# author :wujiajia
# email :wujiajia666@qq.com
# file  : user_infro.py
# Software: PyCharm
# python_version: 3.6
# funcation:  微博用户信息的爬取接口  "https://weibo.cn/5479678683/info"        5479678683  是 用户id
import requests
from lxml import etree
import pandas  as pd
import time
# Load the workbook of comments; its "userid" column supplies the user ids to look up.
df = pd.read_excel("id为4511519801087016微博的所有评论.xlsx")
user_ids = df["userid"]


# Parallel column lists filled by get_user_infro(); same index = same user.
name_ = []  # nickname
sex_ = []  # gender
address_ = []  # region
birth_ = []  # birthday
infr_dic = {}  # label -> value mapping parsed from a profile page
#  获取粉丝信息
def get_user_infro(user_id):  # e.g. 5479678683
    """Fetch one user's info page from weibo.cn and record the basic fields.

    The text nodes selected by the XPath ``//div[6]/text()`` are treated as
    ``label:value`` lines. Nickname, gender, region and birthday are appended
    to ``name_``, ``sex_``, ``address_`` and ``birth_``; a field that is
    absent from the page is stored as an empty string. ``infr_dic`` is
    replaced with the fields of this user only, so values from a previously
    processed user can no longer leak into the current one.

    Args:
        user_id: Numeric id of the user, inserted into the info-page URL.

    Raises:
        KeyError: If the page contains no "昵称" line, which indicates the
            page could not be parsed as a profile at all.
    """
    url = "https://weibo.cn/{}/info".format(user_id)
    # requests' ``cookies=`` argument maps cookie names to values, so the raw
    # browser cookie string is sent as the Cookie header instead.
    headers = {
        "Cookie": "复制浏览中的cookie信息到这里"
    }
    # NOTE(review): verify=False (kept from the original) disables TLS
    # certificate checking; drop it unless an intercepting proxy requires it.
    html = requests.get(url=url, headers=headers, verify=False, timeout=30).content
    tree = etree.HTML(html)
    path_infro = '//div[6]/text()'
    user_content = tree.xpath(path_infro)
    print(user_content)

    parsed = {}
    for line in user_content:
        # Split at the first colon, ASCII or full-width, instead of assuming
        # every label is exactly two characters long.
        cuts = [pos for pos in (line.find(":"), line.find("：")) if pos != -1]
        if not cuts:
            continue
        cut = min(cuts)
        parsed[line[:cut].strip()] = line[cut + 1:].strip()

    if "昵称" not in parsed:
        raise KeyError("昵称")

    infr_dic.clear()
    infr_dic.update(parsed)
    name_.append(parsed["昵称"])
    sex_.append(parsed.get("性别", ""))
    address_.append(parsed.get("地区", ""))
    # Default only when the page has no birthday line; the birthday is no
    # longer wiped by the lines that follow it.
    birth_.append(parsed.get("生日", ""))

def save():
    """Print the collected user rows and write them to an Excel workbook."""
    rows = list(zip(name_, sex_, address_, birth_))
    print(rows)
    import pandas as pd
    header = ["name","sex","address","birth"]
    pd.DataFrame(rows, columns=header).to_excel("粉丝位置信息_2.xlsx")
# *********************调用粉丝爬虫函数
def run_spider_user(limit=1000, delay=3):
    """Look up the first ``limit`` users in ``user_ids`` and save the results.

    The crawl stops at the first user whose lookup raises. Whatever was
    collected up to that point is always written by save(), both after an
    error and after a complete run (previously a fully successful run never
    saved anything).

    Args:
        limit: Number of leading user ids to process (previously a
            hard-coded 1000); ``None`` processes every id.
        delay: Seconds to sleep before each lookup.
    """
    for index, i in enumerate(user_ids[0:limit]):
        print(index, i)
        time.sleep(delay)  # throttle requests
        try:
            get_user_infro(i)
        except Exception as e:
            # Report the cause instead of swallowing it, then stop crawling.
            print()
            print("出错，保存前{}个粉丝信息".format(index), e)
            break
    save()
# get_user_infro(5335275912)
if __name__== "__main__":
    # Entry point: look up the users listed in the comments workbook.
    run_spider_user()

map_weibo_fensi.py

# time  : 2020/6/10 17:26
# author :wujiajia
# email :wujiajia666@qq.com
# file  : map_weibo_fensi.py
# Software: PyCharm
# python_version: 
# funcation:  将迪丽热巴的粉丝在全国范围内显示
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Map

# Raw string: the Windows path contains backslashes that must not be parsed
# as escape sequences (the runtime value of the path is unchanged).
df = pd.read_excel(r"E:\PycharmProjects\webspider2020\weibo\粉丝位置信息_2.xlsx")
# Reduce each address to its province prefix. Empty cells come back from Excel
# as NaN and are skipped; the two provinces with three-character names are
# kept whole instead of being cut to two characters.
data = [
    addr[0:3] if addr.startswith(("黑龙江", "内蒙古")) else addr[0:2]
    for addr in df["address"].dropna().astype(str)
]
dfs = pd.DataFrame(data,columns=["省份"])
dfz = dfs["省份"].value_counts()
s = dfz.index.tolist()  # province names, most frequent first
v = [str(i) for i in dfz.tolist()]  # fan count per province, as strings
print(list(zip(s,v)))
# Input for the map: (province, fan count) pairs.
# NOTE(review): the two most frequent entries are dropped by position;
# presumably they are non-province values in the data — confirm.
da = list(zip(s,v))[2:]

def map_china() -> Map:
    """Build the China map chart of fan counts from the module-level ``da`` pairs.

    The piecewise visual map uses half-open ranges (``gte`` / ``lt``) so that
    no value falls into two pieces, and the last piece is open-ended so that
    counts above 50 are still coloured as ">=40".

    Returns:
        The configured Map chart (not yet rendered).
    """
    pieces = [
        {"gte": 0, "lt": 10, "label": "0-9", "color": "#FFE4E1"},
        {"gte": 10, "lt": 20, "label": "10-20", "color": "#FF7F50"},
        {"gte": 20, "lt": 30, "label": "20-30", "color": "#F08080"},
        {"gte": 30, "lt": 40, "label": "30-40", "color": "#CD5C5C"},
        {"gte": 40, "label": ">=40", "color": "#8B0000"},
    ]
    c = (
        Map()
        .add(series_name="粉丝数量", data_pair=da, maptype="china",zoom = 1,center=[105,38])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="粉丝分布"),
            visualmap_opts=opts.VisualMapOpts(max_=9999, is_piecewise=True, pieces=pieces),
        )
    )
    return c

# Build the map chart and write it out as a standalone HTML page.
d_map = map_china()
d_map.render("迪丽热巴中国粉丝分布.html")



# 粉丝性别分析   饼状图
# Gender breakdown of the fans, drawn as a pie chart.
sex_ = df["sex"].value_counts()
print(sex_)
s_ = sex_.index.tolist()  # category labels, e.g. ["女", "男"]
v_ = sex_.tolist()  # matching counts, e.g. [442, 81]

from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.faker import Faker

# render() is the last call in the chain, so `c` holds its return value.
c = (
    Pie()
    .add(series_name="", data_pair=[[label, count] for label, count in zip(s_, v_)])
    .set_colors(["orange", "purple"])
    .set_global_opts(title_opts=opts.TitleOpts(title="粉丝男女分布"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    .render("粉丝男女分布.html")
)

代码及数据表格的获取方式：

关注公众号，在后台回复【新浪微博】