Scrapy 模拟登录并爬取 51CTO 文章

最新推荐文章于 2021-06-22 20:44:46 发布

风萧萧兮易水寒！

最新推荐文章于 2021-06-22 20:44:46 发布

阅读量387

点赞数

分类专栏：爬虫　文章标签：爬虫

爬虫专栏收录该内容

2 篇文章

订阅专栏

本文介绍了一个使用Scrapy框架实现的爬虫项目，详细解析了如何针对51CTO网站进行登录及博客内容抓取的过程。通过分析源代码，读者可以了解到如何处理网站的登录机制、提取csrf值、设置请求头以及如何解析并保存抓取到的数据。项目还展示了如何利用FormRequest处理POST请求，以及如何递归抓取多页内容。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

a51cto.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest
from cto.items import CtospiderItem

class CtoSpider(scrapy.Spider):
    """Log in to 51CTO and scrape the article list of a blog.

    Flow: fetch the login page, submit the login form together with the
    CSRF token, then walk the paginated article list and yield one
    ``CtospiderItem`` per article.
    """

    name = '51cto'
    allowed_domains = ['51cto.com']

    def start_requests(self):
        """Request the login page, tagging the request with cookie jar 1."""
        urls = ['http://home.51cto.com/index']
        for url in urls:
            yield scrapy.Request(url, callback=self.cto_login, meta={'cookiejar': 1})

    def cto_login(self, response):
        """Submit the login form found in ``response``.

        Yields a single ``FormRequest`` whose callback is ``after_login``.
        """
        # Read the CSRF token from the login page.
        csrf = response.xpath("//input[@name='d1g0Smlta3o7DE0kJiU8OQM3WTMjXhtDJCp8JC0qADhPH2YbGT5dHw==']/@value").extract_first()
        if csrf is None:
            # NOTE(review): the hard-coded input name above looks like a
            # session-specific token rather than a stable field name. Fall
            # back to the name the token is submitted under ('_csrf').
            csrf = response.xpath("//input[@name='_csrf']/@value").extract_first()

        # NOTE(review): advertising 'br' requires brotli support to be
        # installed for Scrapy to decode such responses - confirm.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://blog.51cto.com',
            'Content-Type': 'application/x-www-form-urlencoded',
        }

        formdata = {
            # The '0' must be a string, otherwise the request fails; this
            # flag controls the "remember me / auto-login for 10 days" option.
            'LoginForm[rememberMe]': '0',
            'LoginForm[username]': '****',
            'LoginForm[password]': '****',
        }
        if csrf is not None:
            formdata['_csrf'] = csrf
        # When no token was found the key is left out on purpose, so the
        # hidden field pre-populated by from_response() is not overridden
        # with None.

        # NOTE(review): the form is posted to the blog URL rather than to
        # the form's own action - kept as in the original; confirm.
        yield FormRequest.from_response(
            response,
            url='https://blog.51cto.com/linuxliu?type=1',
            headers=headers,
            meta={'cookiejar': response.meta['cookiejar']},
            formdata=formdata,
            callback=self.after_login,
            dont_click=True,
        )

    def after_login(self, response):
        """Parse one page of the article list and follow the next page.

        Yields a ``CtospiderItem`` for every list entry that has both a
        title and a link, then a request for the next page if one exists.
        """
        # Text of the 'con' links; presumably contains the account name
        # when the session is logged in.
        home_page = response.xpath("//a[@class='con']/text()").extract()
        if 'wx5c789cd76c3af' in home_page:
            self.logger.info('我的博客')
        else:
            self.logger.error('登录失败')

        for resp in response.css("ul.artical-list li"):
            title_url = resp.css("a.tit::attr(href)").extract_first()
            title = resp.css("a.tit::text").extract_first()
            if title_url is None or title is None:
                # Entry without a title link: nothing to record.
                continue

            # A fresh item per article; the original never created one.
            item = CtospiderItem()
            item['title_url'] = title_url
            item['title'] = title.strip()
            # fullname has the form "[title](url)", which is Markdown link
            # syntax: clicking the title opens the article.
            item['fullname'] = '[' + item['title'] + ']' + '(' + item['title_url'] + ')'
            yield item

        # Follow pagination.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(
                # urljoin makes a relative href absolute; Request rejects
                # URLs without a scheme.
                response.urljoin(next_page),
                callback=self.after_login,
                # Keep using the cookie jar that holds the login session.
                meta={'cookiejar': response.meta.get('cookiejar')},
            )

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class CtospiderItem(scrapy.Item):
    """Container for one scraped blog article entry."""

    # Article title text (the spider stores it stripped of whitespace).
    title = scrapy.Field()
    # The href of the article's title link.
    title_url = scrapy.Field()
    # Markdown-style link built by the spider as "[title](title_url)".
    fullname = scrapy.Field()