Comparing two ways to fetch Weibo topic-search data: 1. without a framework; 2. with the Scrapy framework

Search Weibo for "洛杉矶大火" (Los Angeles wildfires) and extract: the post URL, post content, user ID, user name, user description, user verification, and followers count.

1. Without a crawler framework

import urllib.parse

import requests
from lxml import html

headers = {
    "accept": "application/json, text/plain, */*",
    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,ko;q=0.7",
    "cache-control": "no-cache",
    "mweibo-pwa": "1",
    "pragma": "no-cache",
    "priority": "u=1, i",
    "referer": "https://m.weibo.cn/search?containerid=100103type%3D1%26q%3D%E6%B4%9B%E6%9D%89%E7%9F%B6%E5%A4%A7%E7%81%AB",
    "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"macOS\"",
    "sec-fetch-dest": "empty",
    "sec-fetch-mode": "cors",
    "sec-fetch-site": "same-origin",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
    "x-xsrf-token": "3ae2d3"
}
cookies = {
    "_T_WM": "81286126068",
    "WEIBOCN_FROM": "1110006030",
    "MLOGIN": "0",
    "XSRF-TOKEN": "3ae2d3",
    "mweibo_short_token": "7f85765ed5",
    "M_WEIBOCN_PARAMS": "oid%3D5121194879880289%26luicode%3D10000011%26lfid%3D100103type%253D1%2526q%253D%25E6%25B4%259B%25E6%259D%2589%25E7%259F%25B6%25E5%25A4%25A7%25E7%2581%25AB%26fid%3D100103type%253D1%2526q%253D%25E6%25B4%259B%25E6%259D%2589%25E7%259F%25B6%25E5%25A4%25A7%25E7%2581%25AB%26uicode%3D10000011"
}
url = "https://m.weibo.cn/api/container/getIndex"
params = {
    "containerid": "100103type=1&q=洛杉矶大火",  # search keyword: "Los Angeles wildfires"
    "page_type": "searchall",
    "page": 0
}
response = requests.get(url, headers=headers, cookies=cookies, params=params)

# Parse the Weibo search API response as JSON
weibo_info = response.json()

# Extract and print the fields of interest from a single mblog entry
def get_weibo_info(mblog):
    base_url = 'https://m.weibo.cn/detail/'
    # URL of the post about the "洛杉矶大火" topic
    target_url = f'{base_url}{mblog["id"]}'
    # Post content: decode URL-encoded characters, then parse the HTML fragment
    text = urllib.parse.unquote(mblog["text"])
    ht = html.fromstring(text)
    target_text = ht.xpath("//text()[not(ancestor::a)]")[0]  # text outside <a> tags
    # User ID
    target_user_id = mblog["user"]["id"]
    # User name
    target_name = mblog["user"]["screen_name"]
    # User description
    target_des = mblog["user"]["description"]
    # Verification reason
    target_verified_reason = mblog["user"]["verified_reason"]
    # Followers count
    target_followers_count = mblog["user"]["followers_count_str"]

    target_info = f'"post url": {target_url}\n' \
                  f'"content": {target_text}\n' \
                  f'"user id": {target_user_id}\n' \
                  f'"user name": {target_name}\n' \
                  f'"user description": {target_des}\n' \
                  f'"verification": {target_verified_reason}\n' \
                  f'"followers count": {target_followers_count}'
    print(target_info)


# Loop over the cards to collect: post URL, post content, user ID, user name,
# user description, user verification, and followers count.
# Inspection shows that cards with card_type == 4 carry no topic content,
# card_type == 9 does, and card_type == 11 nests card_type == 9 entries
# (which also carry topic content).
for item in weibo_info['data']['cards']:

    if item["card_type"] != 4:  # drop the card_type == 4 noise

        if item["card_type"] == 9:
            get_weibo_info(item['mblog'])

        elif item["card_type"] == 11:
            for i in item['card_group']:
                if i['card_type'] == 9:  # drop nested noise entries
                    get_weibo_info(i['mblog'])

            print("***************")

        print("-----------------")

Output:
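
To fetch more than the first page of results, the script above can be wrapped in a simple pagination loop. Below is a minimal sketch that reuses url, headers, cookies, params, and get_weibo_info from above; it assumes that incrementing the page parameter walks the result pages and that an empty cards list marks the end, which should be verified against the live API:

import time

max_pages = 10  # arbitrary safety cap (assumption)
for page in range(max_pages):
    params["page"] = page
    resp = requests.get(url, headers=headers, cookies=cookies, params=params)
    cards = resp.json().get("data", {}).get("cards", [])
    if not cards:  # assumed end-of-results condition
        break
    for item in cards:
        if item["card_type"] == 9:
            get_weibo_info(item["mblog"])
        elif item["card_type"] == 11:
            for i in item["card_group"]:
                if i["card_type"] == 9:
                    get_weibo_info(i["mblog"])
    time.sleep(1)  # be polite between requests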

2. Using the Scrapy framework

a. Create a Scrapy project:

scrapy startproject weibo_spider weiboProject
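
For orientation (not spelled out in the original steps), this creates a project directory weiboProject whose Python package is weibo_spider, laid out roughly as follows:

weiboProject/
    scrapy.cfg
    weibo_spider/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py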

b. Generate a spider (note: the URL is the API endpoint, not the address typed into the browser):

scrapy genspider weiboSpider "https://m.weibo.cn/api/container/getIndex"
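
This generates weibo_spider/spiders/weiboSpider.py with a skeleton roughly like the following (the exact template varies a little between Scrapy versions); step d below replaces its body:

import scrapy


class WeibospiderSpider(scrapy.Spider):
    name = "weiboSpider"
    allowed_domains = ["m.weibo.cn"]
    start_urls = ["https://m.weibo.cn/api/container/getIndex"]

    def parse(self, response):
        pass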

c. Edit the settings.py file

1. Do not obey the site's robots.txt rules (the rules themselves can be viewed at https://m.weibo.cn/robots.txt):
    ROBOTSTXT_OBEY = False
2. Set the USER_AGENT:
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
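
Taken together, the relevant excerpt of weibo_spider/settings.py looks like this; the DOWNLOAD_DELAY line is an optional extra (not part of the steps above) to keep the request rate polite:

# weibo_spider/settings.py (excerpt)

# Do not fetch or obey https://m.weibo.cn/robots.txt
ROBOTSTXT_OBEY = False

# Send a desktop Chrome user agent with every request
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"

# Optional extra: throttle requests slightly
DOWNLOAD_DELAY = 1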

d. The weiboSpider code

import urllib.parse
from urllib.parse import urlencode

import scrapy
from lxml import html


class WeibospiderSpider(scrapy.Spider):
    name = "weiboSpider"
    allowed_domains = ["m.weibo.cn"]
    # API endpoint of the mobile Weibo search (not a start_urls list,
    # since start_requests builds the request explicitly)
    api_url = "https://m.weibo.cn/api/container/getIndex"

    def start_requests(self):
        # Query parameters for the "洛杉矶大火" (Los Angeles wildfires) search
        params = {
            "containerid": "100103type=1&q=洛杉矶大火",
            "page_type": "searchall",
            "page": 0
        }

        # Build the full URL with the encoded query string appended
        url_with_params = f"{self.api_url}?{urlencode(params)}"

        # Issue the request
        yield scrapy.Request(
            url=url_with_params,
            callback=self.parse
        )

    def parse(self, response):
        weibo_info = response.json()

        # Extract and print the fields of interest from a single mblog entry
        def get_weibo_info(mblog):
            base_url = 'https://m.weibo.cn/detail/'
            # URL of the post about the "洛杉矶大火" topic
            target_url = f'{base_url}{mblog["id"]}'
            # Post content: decode URL-encoded characters, then parse the HTML fragment
            text = urllib.parse.unquote(mblog["text"])
            ht = html.fromstring(text)
            target_text = ht.xpath("//text()[not(ancestor::a)]")[0]  # text outside <a> tags
            # User ID
            target_user_id = mblog["user"]["id"]
            # User name
            target_name = mblog["user"]["screen_name"]
            # User description
            target_des = mblog["user"]["description"]
            # Verification reason
            target_verified_reason = mblog["user"]["verified_reason"]
            # Followers count
            target_followers_count = mblog["user"]["followers_count_str"]

            target_info = f'"post url": {target_url}\n' \
                          f'"content": {target_text}\n' \
                          f'"user id": {target_user_id}\n' \
                          f'"user name": {target_name}\n' \
                          f'"user description": {target_des}\n' \
                          f'"verification": {target_verified_reason}\n' \
                          f'"followers count": {target_followers_count}'
            print(target_info)

        # Same card_type handling as in the standalone script:
        # card_type == 4 is noise, 9 carries topic content, 11 nests 9s.
        for item in weibo_info['data']['cards']:

            if item["card_type"] != 4:  # drop the card_type == 4 noise

                if item["card_type"] == 9:
                    get_weibo_info(item['mblog'])

                elif item["card_type"] == 11:
                    for i in item['card_group']:
                        if i['card_type'] == 9:  # drop nested noise entries
                            get_weibo_info(i['mblog'])

                    print("***************")

                print("-----------------")

e. Run the spider: scrapy crawl weiboSpider
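
As written, the spider only prints its results. A sketch of one way to lean on Scrapy's built-in feed exports instead (a variation of mine, not part of the original code): have parse yield one dict per post rather than printing, keeping the same card_type filtering:

    # Inside WeibospiderSpider: a yielding variant of parse (sketch)
    def parse(self, response):
        for item in response.json()['data']['cards']:
            mblogs = []
            if item["card_type"] == 9:
                mblogs.append(item['mblog'])
            elif item["card_type"] == 11:
                mblogs.extend(i['mblog'] for i in item['card_group']
                              if i['card_type'] == 9)
            for mblog in mblogs:
                yield {
                    "url": f'https://m.weibo.cn/detail/{mblog["id"]}',
                    "user_id": mblog["user"]["id"],
                    "user_name": mblog["user"]["screen_name"],
                    "followers_count": mblog["user"]["followers_count_str"],
                }

With that change, scrapy crawl weiboSpider -o results.json writes the yielded records to results.json through Scrapy's feed export, with no extra code.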

3. Inspecting the data with the scrapy shell command

a. First, change into the root directory of the Scrapy project you created: cd weiboProject

b. Then run the scrapy shell command:

scrapy shell 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%E6%B4%9B%E6%9D%89%E7%9F%B6%E5%A4%A7%E7%81%AB&page_type=searchall&page=0'

Seeing a response with status 200 in the shell banner means the request returned normally.

c. Test the data extraction interactively, then update the weiboSpider code accordingly.
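
For example, the following interactive commands (an illustrative session, based on the response structure used above) help confirm the JSON layout before committing the logic to the spider:

>>> data = response.json()
>>> len(data['data']['cards'])                         # number of cards returned
>>> [c['card_type'] for c in data['data']['cards']]    # spot the 4 / 9 / 11 card types
>>> data['data']['cards'][0].keys()                    # fields available on a card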

4. Reflections

Scrapy has a relatively steep learning curve, and this example does not really show off its strengths, but it pays for itself when building distributed crawlers.
