三行代码、三步实现 Scrapy 动态 User-Agent

诗与南巷

已于 2022-07-23 15:54:17 修改

阅读量622

点赞数 1

分类专栏： python scrapy 爬虫文章标签： scrapy python 开发语言

于 2022-07-23 14:28:09 首次发布

本文链接：https://blog.youkuaiyun.com/weixin_44848436/article/details/125946972

版权

python 同时被 3 个专栏收录

1 篇文章

订阅专栏

scrapy

1 篇文章

订阅专栏

爬虫

1 篇文章

订阅专栏

Scrapy 使用 fake_useragent ua库实现随机User-Agent

设置动态User-Agent只需要三步

设置动态User-Agent只需要三步

      下载 随机ua库 ：pip install fake_useragent

1、找到并打开我们的下载中间件类：项目名+DownloaderMiddleware，它位于项目的 middlewares.py 文件中
2、导入 UserAgent 对象，并在步骤一的类下找到 process_request() 方法，在其中设置 User-Agent
3、在 settings 文件中打开我们的下载中间件


1、步骤一和二

#导入 ua库
from fake_useragent import UserAgent
class RandomheaderDownloaderMiddleware:
    """Downloader middleware that puts a random User-Agent on each request."""

    # UserAgent object created once with the class and shared by its instances.
    ua = UserAgent()

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with ``self.ua.random``.

        Always returns None.
        """
        random_agent = self.ua.random
        request.headers['User-Agent'] = random_agent
        return None


2、步骤三
DOWNLOADER_MIDDLEWARES = {
   # Locate the downloader-middleware setting in settings.py and uncomment it.
   'randomHeader.middlewares.RandomheaderDownloaderMiddleware': 543,
}

3、最后在 spider 中打印 response.request.headers['User-Agent']，可以看到 User-Agent 已经设置成功
在这里插入图片描述

最后附上这几个py文件的完整代码供参考

# spider 执行类

import scrapy

class TestSpider(scrapy.Spider):
    """Spider used to check which User-Agent header outgoing requests carry.

    It fetches the start URL, prints the User-Agent that was sent with the
    request, and then schedules one follow-up request. That request names no
    callback, so its response is handled by ``parse`` as well.
    """

    name = 'test'
    # Every host the spider requests must be listed here; otherwise the
    # offsite filter drops the follow-up request yielded from parse().
    allowed_domains = ['baidu.com', 'wenzhou.gov.cn']
    start_urls = ['https://www.baidu.com']

    def parse(self, response):
        """Print the User-Agent sent for *response*, then yield a follow-up request."""
        print(response.request.headers['User-Agent'])
        yield scrapy.Request(url='https://ggzyjy.wenzhou.gov.cn/wzcms/zfcgcggg/index_1.htm')


middlewares.py




# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from fake_useragent import UserAgent
from selenium import webdriver
from scrapy.http import HtmlResponse

class RandomheaderSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook hands its input on unchanged; the only side effect is a log
    line written when a spider is opened.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect ``spider_opened`` to its signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Accept every response on its way into the spider; returns None."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each element of *result* unchanged, in order."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Do nothing with the exception; returns None."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield each element of *start_requests* unchanged, in order."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


class RandomheaderDownloaderMiddleware:
    """Downloader middleware that gives every request a random User-Agent.

    The other hooks are pass-throughs: responses are returned unchanged and
    exceptions are not handled here.
    """

    # UserAgent object created once with the class and shared by its instances.
    ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect ``spider_opened`` to its signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Overwrite the request's User-Agent header with ``self.ua.random``.

        Always returns None.
        """
        random_agent = self.ua.random
        request.headers['User-Agent'] = random_agent
        return None

    def process_response(self, request, response, spider):
        """Return *response* unchanged."""
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing with the exception; returns None."""
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)


settings.py

# Scrapy settings for randomHeader project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project's bot.
BOT_NAME = 'randomHeader'

# The same module is used both as the target for newly generated spiders
# and as the only entry in the list of spider modules.
NEWSPIDER_MODULE = 'randomHeader.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'randomHeader.middlewares.RandomheaderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   # The project's downloader middleware, registered with order value 543.
   'randomHeader.middlewares.RandomheaderDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'randomHeader.pipelines.RandomheaderPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'