Consulting on getting the code to run is available, to make sure it runs successfully.
The code in this article has also been synced to the WeChat public account below. Please follow 【Python人生之技术实践】; crawler tutorials for Douban, Zhihu, Weibo, NetEase Cloud Music, Lagou, and CNKI are coming soon. The code is finished and is being tidied up. Thank you.

Crawling modes:
1 Crawl all comments under a given topic
2 Crawl the comments on every post of a given blogger
3 Crawl the profile and location information of all of a given blogger's fans
The tricky part of crawling Weibo comments:
obtaining the max_id and max_id_type parameters.
Taking "Dilraba's Weibo" (迪丽热巴) as the case study, we crawl all of her posts, all comments on those posts, and the profile information of all of her fans, then analyse the fans' locations and plot the fan distribution on a map.


1 requests_weibo.py: crawls all of Dilraba's posts and saves the post text to Excel, together with each post's id.
2 requests_weibo_comment.py: reads all of the post ids, then crawls every comment on a given post together with the commenters' (fan) ids.
3 user_infro.py: reads all of the fan ids and crawls each fan's basic profile (birthday, location, gender). The fan-info endpoint is given in the code.
4 map_weibo_fensi.py: visualises the fans' gender and location information (a sketch of how the four scripts chain together follows this list).
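The four scripts form a simple pipeline: each step writes a file that the next step reads. A minimal sketch of the run order, assuming all four scripts sit in the same folder and the cookie placeholders inside them have been filled in:

import subprocess

for script in [
    "requests_weibo.py",          # writes 迪丽热巴所有的微博.xlsx (post id, text, like/comment counts)
    "requests_weibo_comment.py",  # reads the file above, writes id为4511519801087016微博的所有评论.xlsx
    "user_infro.py",              # reads the comment file, writes 粉丝位置信息_2.xlsx
    "map_weibo_fensi.py",         # reads 粉丝位置信息_2.xlsx, renders the two .html charts
]:
    subprocess.run(["python", script], check=True)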
Notes:
When fetching Sina Weibo comments, the URL parameters are different for page 1, for pages 2 to 15, and for page 16 onwards.
Page 1: https://m.weibo.cn/comments/hotflow?id=4511519801087016&mid=4511519801087016
Pages 2-15: max_id_type is 0, e.g.
https://m.weibo.cn/comments/hotflow?id=4511519801087016&mid=4511519801087016&max_id=1290444946018013&max_id_type=0
Page 16 and beyond:
From page 16 onwards max_id_type becomes 1. Take this value from the previous response; do not set it to 1 by hand.
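A minimal sketch of this pagination chain (the full comment crawler, requests_weibo_comment.py, appears further below; it also sends a mobile User-Agent and a cookie, which this sketch omits for brevity, so a real request may need them):

import requests

def iter_comment_pages(weibo_id, pages=3):
    params = {"id": weibo_id, "mid": weibo_id}      # page 1: no max_id / max_id_type
    for _ in range(pages):
        data = requests.get("https://m.weibo.cn/comments/hotflow", params=params).json()["data"]
        yield data["data"]                          # the list of comments on this page
        params = {
            "id": weibo_id,
            "mid": weibo_id,
            "max_id": data["max_id"],               # next-page cursor returned by the server
            "max_id_type": data["max_id_type"],     # 0 for pages 2-15, becomes 1 from page 16 on
        }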
requests_weibo.py
# time : 2020/6/1 11:37
# author : wujiajia
# email : wujiajia666@qq.com
# file : requests_weibo.py
# Software: PyCharm
# python_version: 3.6
# function: fetch all of Dilraba's posts and the detail URLs used later for the fan comments
import requests
import time
# ************************
time_ = []        # publication time of each post
comment_url = []  # detail-page URL of each post
text = []         # post text
comment_num = []  # number of comments
zan_num = []      # number of likes
id = []           # unique id of each post; needed to build the comment URL, e.g. https://m.weibo.cn/comments/hotflow?id=4510598555019129&mid=4510598555019129&max_id_type=0
def spider(a):  # parameter a is the page number of the post list
    url = "https://m.weibo.cn/api/container/getIndex"  # the m. prefix is the mobile site
    # the request headers must mimic a phone or iPad
    headers = {'User-Agent': 'Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) AppleWebKit/604.1.34 (KHTML, like Gecko) Version/11.0 Mobile/15A5341f Safari/604.1'}
    data = {  # these are the query parameters shown in the XHR request
        "uid": "1669879400",
        "t": "0",
        "luicode": "10000011",
        "lfid": "100103type=1&q=迪丽热巴",
        "type": "uid",
        "value": "1669879400",
        "containerid": "1076031669879400",
        "page": str(a)
    }
    data_html = requests.get(url=url, params=data, headers=headers).json()  # fetch the JSON data
    # print(data_html)
    for num in range(1, len(data_html["data"]["cards"])):  # len(data_html["data"]["cards"]) is the number of cards on this page
        if data_html["data"]["cards"][num]["card_type"] == 9:
            # print(data_html["data"]["cards"][num]["mblog"]["text"])
            time_.append(data_html["data"]["cards"][num]["mblog"]["created_at"])
            text.append(data_html["data"]["cards"][num]["mblog"]["text"])
            id.append(data_html["data"]["cards"][num]["mblog"]["id"])
            comment_url.append(data_html["data"]["cards"][num]["scheme"])
            zan_num.append(data_html["data"]["cards"][num]["mblog"]["attitudes_count"])
            if data_html["data"]["cards"][num]["mblog"]["comments_count"] == "100万+":  # convert the string "100万+" (1,000,000+) to a number
                comment_num.append(1000000)
            else:
                comment_num.append(data_html["data"]["cards"][num]["mblog"]["comments_count"])
# ************** call spider() to crawl multiple pages
def run_spider():
    # spider(1)
    for i in range(0, 144):  # 144 is the number of list pages to crawl
        print("--crawling page [{}]--".format(i))
        time.sleep(5)
        spider(i)
    # print(list(zip(time_, text, comment_url)))
# *************** save the data
def save_data():
    V = list(zip(id, time_, text, comment_url, zan_num, comment_num))
    import pandas as pd
    df = pd.DataFrame(V, columns=["id", "time", "text", "comment_url", "zan_num", "comments_num"])
    print(df)
    df.to_excel("迪丽热巴所有的微博.xlsx")

if __name__ == "__main__":
    run_spider()
    save_data()
requests_weibo_comment.py
# time : 2020/6/3 9:28
# author : wujiajia
# email : wujiajia666@qq.com
# file : requests_weibo_comment.py
# Software: PyCharm
# python_version: 3.6
# function: read the post ids, then fetch the fan comments and commenter information
import pandas as pd
import requests
import time

df = pd.read_excel("迪丽热巴所有的微博.xlsx")
comments_id = df["id"]  # all post ids produced by requests_weibo.py
time_ = []              # comment time
text = []               # comment text
user_profile_url = []   # commenter's profile page URL
user_name = []          # commenter's screen name
user_id = []            # unique id of each Weibo user
# ***************************************** spider_weibo_comments(): pass in a post id to fetch all comments on that post
def spider_weibo_comments(weibo_id):
    count = 0  # current page counter
    print(count)
    total_page = 20  # number of comment pages to fetch; each page holds 20 comments, so total_comments / 20 would give the real page count
    headers = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Mobile Safari/537.36",
    }
    cookie = {
        "cookie": "paste your browser cookie string here"
    }
    while count < total_page:  # loop over comment pages
        if count == 0:  # page 1: the URL carries no max_id
            # e.g. https://m.weibo.cn/comments/hotflow?id=4511519801087016&mid=4511519801087016
            url = "https://m.weibo.cn/comments/hotflow"
            data = {
                "id": str(weibo_id),   # id is the post id
                "mid": str(weibo_id),
                # "max_id": "4511519801087016",
                # "max_id_type": "0",
            }
            time.sleep(5)
            try:
                data_html = requests.get(url=url, params=data, headers=headers, cookies=cookie).json()
                # print(data_html)
                max_id = data_html["data"]["max_id"]            # max_id is the cursor for the next comment page
                max_id_type = data_html["data"]["max_id_type"]  # max_id_type marks the boundary: 0 for the first 15 pages, 1 from page 16 on
                parse_html(data_html)  # parse the JSON and store the fields
                count += 1
                print("crawling page [{}]".format(count))
            except Exception as e:
                print(str(count) + " hit an exception", e)
                continue
        if count < 16:  # for the first 15 comment pages the max_id_type parameter is 0
            next_url = "https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}".format(weibo_id, weibo_id, max_id, max_id_type)
            print(next_url)
            time.sleep(5)
            try:
                data_html2 = requests.get(url=next_url, headers=headers, cookies=cookie).json()
                print(data_html2)
                max_id = data_html2["data"]["max_id"]            # overwrite the previous max_id with the new cursor
                max_id_type = data_html2["data"]["max_id_type"]
                print(max_id)
                parse_html(data_html2)
                count += 1
                print("crawling page [{}]".format(count))
            except Exception as e:
                print(str(count) + " hit an exception", e)
                continue
        else:  # from page 16 on the server returns max_id_type = 1; it is taken from the response, not set by hand
            next_url = "https://m.weibo.cn/comments/hotflow?id={}&mid={}&max_id={}&max_id_type={}".format(weibo_id, weibo_id, max_id, max_id_type)
            print(next_url)
            time.sleep(5)
            try:
                data_html2 = requests.get(url=next_url, headers=headers, cookies=cookie).json()
                print(data_html2)
                max_id = data_html2["data"]["max_id"]
                max_id_type = data_html2["data"]["max_id_type"]
                print(max_id)
                parse_html(data_html2)
                count += 1
                print("crawling page [{}]".format(count))
            except Exception as e:
                print(str(count) + " hit an exception", e)
                continue
    # return max_id
# ***************************** parse the JSON and collect the fields
def parse_html(data_html):
    for num in range(0, len(data_html["data"]["data"])):
        s = data_html["data"]["data"]
        time_.append(s[num]["created_at"])
        text.append(s[num]["text"])
        user_profile_url.append(s[num]["user"]["profile_url"])  # the fan's profile page; the fan's location is scraped from it later
        user_name.append(s[num]["user"]["screen_name"])
        user_id.append(s[num]["user"]["id"])  # unique id of each Weibo user
# ********************* run the comment crawler
# comments_id holds all of Dilraba's post ids; loop over them and save the results to Excel.
def run_spider_weiboComments():
    for i in comments_id[0:4]:  # here only the first 4 posts are crawled; widen the slice for more
        spider_weibo_comments(i)
    # spider_weibo_comments(comments_id[3])

# *********************** save the data
def save():
    V = list(zip(user_id, user_name, time_, text, user_profile_url))
    df = pd.DataFrame(V, columns=["userid", "username", "time", "text", "user_profile_url"])
    # print(df)
    df.to_excel("id为4511519801087016微博的所有评论.xlsx")
    print("task finished")

if __name__ == "__main__":
    run_spider_weiboComments()
    save()
user_infro.py
# time : 2020/6/5 23:35
# author : wujiajia
# email : wujiajia666@qq.com
# file : user_infro.py
# Software: PyCharm
# python_version: 3.6
# function: the Weibo user-info endpoint is "https://weibo.cn/5479678683/info", where 5479678683 is the user id
import requests
from lxml import etree
import pandas as pd
import time

df = pd.read_excel("id为4511519801087016微博的所有评论.xlsx")
user_ids = df["userid"]  # the fan ids collected by requests_weibo_comment.py
name_ = []
sex_ = []
address_ = []
birth_ = []
infr_dic = {}

# fetch one fan's profile
def get_user_infro(user_id):  # e.g. 5479678683
    url = "https://weibo.cn/{}/info".format(user_id)
    # "https://weibo.cn/5479678683/info"
    cookie = {
        "cookie": "paste your browser cookie string here"
    }
    html = requests.get(url=url, cookies=cookie, verify=False).content
    tree = etree.HTML(html)
    path_infro = '//div[6]/text()'
    user_content = tree.xpath(path_infro)  # lines such as 昵称:xxx, 性别:女, 地区:xxx, 生日:xxx
    print(user_content)
    infr_dic["生日"] = ""  # default, in case this user did not fill in a birthday
    for i in user_content:
        infr_dic[i[0:2]] = i[3:]  # the first two characters are the field name, the rest (after the colon) is the value
    name_.append(infr_dic["昵称"])
    sex_.append(infr_dic["性别"])
    address_.append(infr_dic["地区"])
    birth_.append(infr_dic["生日"])
def save():
    print(list(zip(name_, sex_, address_, birth_)))
    v = list(zip(name_, sex_, address_, birth_))
    df = pd.DataFrame(v, columns=["name", "sex", "address", "birth"])
    df.to_excel("粉丝位置信息_2.xlsx")

# ********************* run the fan crawler
def run_spider_user():
    for index, i in enumerate(user_ids[0:1000]):  # at most the first 1000 fans
        print(index, i)
        time.sleep(3)
        try:
            get_user_infro(i)
        except Exception as e:
            print(e)
            print("error; saving the first {} fan profiles".format(index))
            save()
            break
    else:
        save()  # if the loop finishes without an error, save everything as well
    # get_user_infro(5335275912)

if __name__ == "__main__":
    run_spider_user()
map_weibo_fensi.py
# time : 2020/6/10 17:26
# author : wujiajia
# email : wujiajia666@qq.com
# file : map_weibo_fensi.py
# Software: PyCharm
# python_version:
# function: plot Dilraba's fans on a map of China
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Map

df = pd.read_excel(r"E:\PycharmProjects\webspider2020\weibo\粉丝位置信息_2.xlsx")  # raw string so the backslashes are not treated as escapes
# print(df["address"].value_counts())
data = list(i[0:2] for i in df["address"])  # keep only the first two characters of the address, i.e. the province name
dfs = pd.DataFrame(data, columns=["省份"])
dfz = dfs["省份"].value_counts()
s = dfz.index.tolist()                  # province names
v = list(str(i) for i in dfz.tolist())  # fan count per province
print(list(zip(s, v)))
da = list(zip(s, v))[2:]                # map input data: (province, fan count); the first two entries are dropped
def map_china() -> Map:
    c = (
        Map()
        .add(series_name="粉丝数量", data_pair=da, maptype="china", zoom=1, center=[105, 38])
        .set_global_opts(
            title_opts=opts.TitleOpts(title="粉丝分布"),
            visualmap_opts=opts.VisualMapOpts(max_=9999, is_piecewise=True,
                pieces=[{"max": 9, "min": 0, "label": "0-9", "color": "#FFE4E1"},
                        {"max": 20, "min": 10, "label": "10-20", "color": "#FF7F50"},
                        {"max": 30, "min": 20, "label": "20-30", "color": "#F08080"},
                        {"max": 40, "min": 30, "label": "30-40", "color": "#CD5C5C"},
                        {"max": 50, "min": 40, "label": ">=40", "color": "#8B0000"}]
            )
        )
    )
    return c

d_map = map_china()
d_map.render("迪丽热巴中国粉丝分布.html")
# fan gender analysis: pie chart
sex_ = df["sex"].value_counts()
print(sex_)
v_ = sex_.tolist()        # e.g. [442, 81]
s_ = sex_.index.tolist()  # e.g. ["女", "男"]

from pyecharts.charts import Pie

c = (
    Pie()
    .add("", [list(z) for z in zip(s_, v_)])
    .set_colors(["orange", "purple"])
    .set_global_opts(title_opts=opts.TitleOpts(title="粉丝男女分布"))
    .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    .render("粉丝男女分布.html")
)
How to get the code and data spreadsheets:
Follow the public account and reply 【新浪微博】 in the background.
Thank you, everyone. Feel free to ask about any code issues; the crawler series will keep coming. The post 【电子资源分享--xxxxxx】 on the public account shares the study material I have collected in recent years; reply with the corresponding keyword in the background, as described in that post, to get it.
