Scrapy使用requests下载器

部署运行你感兴趣的模型镜像
""" 大部分的scrapy下载中间件都不要 """
import logging
import random
from concurrent.futures import ThreadPoolExecutor

import requests
import scrapy
from requests.adapters import HTTPAdapter
from scrapy.http import TextResponse
from twisted.internet.defer import Deferred
from urllib3.util.ssl_ import create_urllib3_context

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Configure the root logger at import time: DEBUG level, timestamps with
# second resolution, and the emitting logger/module/function/line in each record.
logging.basicConfig(
    format='%(asctime)s [%(name)s:%(module)s:%(funcName)s:%(lineno)d] %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    level=logging.DEBUG,
)


class DESAdapter(HTTPAdapter):
    """HTTPS transport adapter that shuffles the TLS cipher order (JA3).

    Each adapter instance picks one random ordering of the cipher list and
    uses it for every connection it opens, so the TLS ClientHello differs
    from the library default.

    The SSL context is injected through ``init_poolmanager`` and
    ``proxy_manager_for`` rather than by overriding ``get_connection``:
    requests >= 2.32 deprecates ``get_connection`` and ``send()`` no longer
    calls it, so an override there would be silently ignored.
    """

    # Keep ``ciphers`` across pickling; ``__setstate__`` re-runs
    # ``init_poolmanager``, which needs it.
    __attrs__ = HTTPAdapter.__attrs__ + ['ciphers']

    def __init__(self, *args, **kwargs):
        origin_ciphers = 'ECDH+AESGCM:DH+AESGCM:ECDH+AES256:DH+AES256:ECDH+AES128:DH+AES:ECDH+HIGH:DH+HIGH:ECDH+3DES:DH+3DES:RSA+AESGCM:RSA+AES:RSA+HIGH:RSA+3DES'
        ciphers = origin_ciphers.split(':')
        random.shuffle(ciphers)
        # Must be assigned before super().__init__(), because the parent
        # constructor calls init_poolmanager(), which reads self.ciphers.
        self.ciphers = ':'.join(ciphers) + ':!aNULL:!eNULL:!MD5'
        super().__init__(*args, **kwargs)

    def _build_ssl_context(self):
        """Return an SSL context restricted to this adapter's cipher string."""
        context = create_urllib3_context(ciphers=self.ciphers)
        # With check_hostname left on, a request made with verify=False fails
        # with "Cannot set verify_mode to CERT_NONE when check_hostname is
        # enabled". urllib3 matches the hostname itself when verification is
        # requested and the context does not do it.
        context.check_hostname = False
        return context

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with the custom SSL context attached."""
        kwargs['ssl_context'] = self._build_ssl_context()
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Create a proxy manager with the custom SSL context attached."""
        kwargs['ssl_context'] = self._build_ssl_context()
        return super().proxy_manager_for(*args, **kwargs)


class RequestsDownloadMiddleware:
    """Scrapy downloader middleware that fetches pages with ``requests``.

    ``process_request`` returns a Deferred, and the blocking HTTP call runs on
    a thread pool. The Deferred fires with a ``TextResponse`` built from the
    ``requests`` response, or with an empty status-500 response on failure.
    """

    def __init__(self):
        # Upper bound on the number of concurrent blocking downloads.
        self.executor = ThreadPoolExecutor(max_workers=7)

    def process_request(self, request, spider):
        """Schedule the download on the thread pool and return its Deferred."""
        d = Deferred()
        self.executor.submit(self.defer, d, request, spider)
        return d

    def defer(self, deferred: Deferred, request: scrapy.Request, spider: scrapy.Spider):
        """Download ``request`` in a worker thread and fire ``deferred``.

        Any exception raised while downloading or building the response is
        logged and converted into a body-less response with status 500.
        """
        # Imported here so that importing this module does not install a
        # reactor before Scrapy has chosen one.
        from twisted.internet import reactor

        try:
            resp = self.download(request)
            # NOTE(review): requests has already decoded gzip/deflate bodies,
            # yet resp.headers may still carry Content-Encoding; confirm that
            # Scrapy's HttpCompressionMiddleware is disabled for this setup.
            response = TextResponse(
                url=request.url,
                status=resp.status_code,
                headers=resp.headers,
                body=resp.content,
                request=request
            )
        except Exception as e:
            logger.warning(e)
            response = TextResponse(url=request.url, status=500, request=request)
        # Deferreds are not thread-safe: fire the callback chain on the
        # reactor thread instead of on this worker thread.
        reactor.callFromThread(deferred.callback, response)

    def download(self, request) -> requests.Response:
        """Perform the HTTP request with a fresh session and return the response.

        The session is closed on return; the body has already been read
        because the request is not streamed.
        """
        with self.create_session({}) as session:
            return session.request(method=request.method, url=request.url, data=request.body,
                                   headers=dict(request.headers.to_unicode_dict()), timeout=10)

    def create_session(self, proxies) -> requests.Session:
        """Build a session with browser-like headers and the JA3 adapter.

        :param proxies: mapping of scheme to proxy URL, merged into the session.
        :return: a session with certificate verification disabled and
            ``DESAdapter`` mounted for ``https://`` URLs.
        """
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'close',
            # Random Chrome major version per session.
            'User-Agent': f'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(105, 128)}.0.0.0 Safari/537.36',
        }
        s = requests.Session()
        s.headers.update(headers)
        s.proxies.update(proxies)
        # TLS certificate verification is deliberately turned off.
        s.verify = False
        s.mount('https://', DESAdapter())
        return s

您可能感兴趣的与本文相关的镜像

Python3.8

Python3.8

Conda
Python

Python 是一种高级、解释型、通用的编程语言,以其简洁易读的语法而闻名,适用于广泛的应用,包括Web开发、数据分析、人工智能和自动化脚本

评论 1
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值