Search Weibo for "洛杉矶大火" (the Los Angeles wildfires) and extract, for each result: post URL, related content, user ID, user name, user description, user verification, and follower count.
1. Without a crawler framework
import urllib.parse

import requests
from lxml import html
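# Session-specific headers and cookies captured from the browser's devtools;
# note the x-xsrf-token header mirrors the XSRF-TOKEN cookie, and these values expire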
headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,ko;q=0.7",
    "cache-control": "no-cache",
    "mweibo-pwa": "1",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E6%B4%9B%E6%9D%89%E7%9F%B6%E5%A4%A7%E7%81%AB",
    "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
    "x-xsrf-token": "3ae2d3"
}
cookies = {
    "_T_WM": "81286126068",
    "WEIBOCN_FROM": "1110006030",
    "MLOGIN": "0",
    "XSRF-TOKEN": "3ae2d3",
    "mweibo_short_token": "7f85765ed5",
    "M_WEIBOCN_PARAMS": "oid%3D5121194879880289%26luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E6%25B4%259B%25E6%259D%2589%25E7%259F%25B6%25E5%25A4%25A7%25E7%2581%25AB%26fid%3D100103type%253D1%2526q%253D%25E6%25B4%259B%25E6%259D%2589%25E7%259F%25B6%25E5%25A4%25A7%25E7%2581%25AB%26uicode%3D10000011"
}
url = "https://m.weibo.cn/api/container/getIndex"
params = {
"containerid": "100103type=1&q=洛杉矶大火",
"page_type": "searchall",
"page":0
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)
# Parse the Weibo API response as JSON
weibo_info = response.json()
# print(len(weibo_info['data']['cards']))
# Loop over the cards to extract: post URL, 洛杉矶大火-related content, user ID,
# user name, user description, user verification, and follower count.
# Inspection shows card_type == 4 entries carry no topic content, card_type == 9
# entries do, and card_type == 11 entries nest card_type == 9 cards that also
# carry topic content.
# A post's fields live under its "mblog" key; this helper extracts and prints them
def get_weibo_info(mblog):
    base_url = 'https://m.weibo.cn/detail/'
    # URL of the post under the "洛杉矶大火" topic
    target_url = f'{base_url}{mblog["id"]}'
    # Post content: decode any percent-escaped characters, then drop link markup
    text = urllib.parse.unquote(mblog["text"])
    ht = html.fromstring(text)
    texts = ht.xpath("//text()[not(ancestor::a)]")  # text outside <a> tags
    target_text = texts[0] if texts else ""  # guard against link-only posts
    # User ID
    target_user_id = mblog["user"]["id"]
    # User name
    target_name = mblog["user"]["screen_name"]
    # User description
    target_des = mblog["user"]["description"]
    # User verification
    target_verified_reason = mblog["user"]["verified_reason"]
    # Follower count
    target_followers_count = mblog["user"]["followers_count_str"]
    target_info = f'"话题url": {target_url}\n' \
                  f'"话题内容": {target_text}\n' \
                  f'"用户id": {target_user_id}\n' \
                  f'"用户名称": {target_name}\n' \
                  f'"用户描述": {target_des}\n' \
                  f'"用户认证": {target_verified_reason}\n' \
                  f'"粉丝量": {target_followers_count}'
    print(target_info)

for item in weibo_info['data']['cards']:
    if item["card_type"] != 4:  # drop the card_type == 4 noise entries
        if item["card_type"] == 9:
            get_weibo_info(item['mblog'])
        elif item["card_type"] == 11:
            for i in item['card_group']:
                if i['card_type'] == 9:  # drop nested noise entries
                    get_weibo_info(i['mblog'])
        print("***************")
        print("-----------------")
Output:
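The script above fetches only page 0. To walk additional result pages, you can loop over the page parameter; a minimal sketch (the empty-cards stop condition and the 10-page cap are assumptions to verify against the live API):

import time

for page in range(10):  # assumed cap; adjust as needed
    params["page"] = page
    resp = requests.get(url, headers=headers, cookies=cookies, params=params)
    cards = resp.json().get("data", {}).get("cards", [])
    if not cards:  # assumed end-of-results signal
        break
    for item in cards:
        ...  # process each card exactly as in the loop above
    time.sleep(1)  # be polite between requests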
2. With the Scrapy framework
a. Create the Scrapy project:
scrapy startproject weibo_spider weiboProject
b. Generate the spider (note that the URL is the API endpoint, not the address typed into the browser):
scrapy genspider weiboSpider "https://m.weibo.cn/api/container/getIndex"
c. Edit the settings file:
1. Do not obey the site's robots.txt rules (the actual rules can be viewed at https://m.weibo.cn/robots.txt):
ROBOTSTXT_OBEY = False
2. Set the USER_AGENT:
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
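Optionally (not part of the original steps), the same settings.py can also carry shared request headers and a crawl delay; a small sketch using standard Scrapy settings:

# Throttle requests so the API is not hammered
DOWNLOAD_DELAY = 1
# Headers sent with every request by default
DEFAULT_REQUEST_HEADERS = {
    "accept": "application/json, text/plain, */*",
    "referer": "https://m.weibo.cn/",
}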
d. weiboSpider code:
import scrapy
from lxml import html
from urllib.parse import unquote, urlencode


# A post's fields live under its "mblog" key; this helper extracts and prints them
def get_weibo_info(mblog):
    base_url = 'https://m.weibo.cn/detail/'
    # URL of the post under the "洛杉矶大火" topic
    target_url = f'{base_url}{mblog["id"]}'
    # Post content: decode any percent-escaped characters, then drop link markup
    text = unquote(mblog["text"])
    ht = html.fromstring(text)
    texts = ht.xpath("//text()[not(ancestor::a)]")  # text outside <a> tags
    target_text = texts[0] if texts else ""  # guard against link-only posts
    target_user_id = mblog["user"]["id"]                           # user ID
    target_name = mblog["user"]["screen_name"]                     # user name
    target_des = mblog["user"]["description"]                      # user description
    target_verified_reason = mblog["user"]["verified_reason"]      # verification
    target_followers_count = mblog["user"]["followers_count_str"]  # follower count
    target_info = f'"话题url": {target_url}\n' \
                  f'"话题内容": {target_text}\n' \
                  f'"用户id": {target_user_id}\n' \
                  f'"用户名称": {target_name}\n' \
                  f'"用户描述": {target_des}\n' \
                  f'"用户认证": {target_verified_reason}\n' \
                  f'"粉丝量": {target_followers_count}'
    print(target_info)


class WeibospiderSpider(scrapy.Spider):
    name = "weiboSpider"
    allowed_domains = ["m.weibo.cn"]
    # API endpoint; a plain attribute, since start_requests below builds the URL itself
    api_url = "https://m.weibo.cn/api/container/getIndex"

    def start_requests(self):
        # Query parameters for the search API
        params = {
            "containerid": "100103type=1&q=洛杉矶大火",
            "page_type": "searchall",
            "page": 0,
        }
        # Build the full URL and issue the request
        url_with_params = f"{self.api_url}?{urlencode(params)}"
        yield scrapy.Request(url=url_with_params, callback=self.parse)

    def parse(self, response):
        weibo_info = response.json()
        for item in weibo_info['data']['cards']:
            if item["card_type"] != 4:  # drop the card_type == 4 noise entries
                if item["card_type"] == 9:
                    get_weibo_info(item['mblog'])
                elif item["card_type"] == 11:
                    for i in item['card_group']:
                        if i['card_type'] == 9:  # drop nested noise entries
                            get_weibo_info(i['mblog'])
                print("***************")
                print("-----------------")
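Idiomatic Scrapy would yield items from parse() rather than print, so that pipelines and feed exports can take over; a minimal sketch with hypothetical field names (not from the original code):

import scrapy

class WeiboPostItem(scrapy.Item):
    # Hypothetical item definition; field names are illustrative
    url = scrapy.Field()
    text = scrapy.Field()
    user_id = scrapy.Field()
    screen_name = scrapy.Field()
    description = scrapy.Field()
    verified_reason = scrapy.Field()
    followers_count = scrapy.Field()

get_weibo_info() could then build and return a WeiboPostItem for parse() to yield, and scrapy crawl weiboSpider -O posts.json would export the results.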
e. Run the spider: scrapy crawl weiboSpider
3. Inspecting the data with the scrapy shell
a. First, cd into the root of the Scrapy project created above: cd weiboProject
b. Then run the scrapy shell command:
scrapy shell 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E6%B4%9B%E6%9D%89%E7%9F%B6%E5%A4%A7%E7%81%AB&page_type=searchall&page=0'
A 200 on the response object indicates the request returned normally.
c. Test the extraction logic interactively, then fold it back into the weiboSpider code; a sample session is sketched below.
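For example, the shell lets you poke at the JSON before committing logic to the spider (a sketch; response.json() requires a reasonably recent Scrapy):

>>> data = response.json()
>>> [card['card_type'] for card in data['data']['cards']]
>>> data['data']['cards'][0].get('mblog', {}).get('text')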
4. Reflections
Scrapy has a fairly steep learning curve, and this small example does not show off its strengths, but the framework pays for itself when building distributed crawlers, as sketched below.
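For reference, distributed crawls are commonly built on the scrapy-redis extension, which swaps Scrapy's scheduler and dedup filter for Redis-backed ones so multiple nodes share one request queue; a minimal settings sketch (assuming the scrapy-redis package is installed):

# settings.py additions for a scrapy-redis setup
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_PERSIST = True  # keep the shared queue between runs
REDIS_URL = "redis://localhost:6379"  # Redis instance all crawler nodes point at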