记忆碎片之python爬虫scrapy框架settings.py参数

最新推荐文章于 2021-09-29 17:02:00 发布

原创 · 最新推荐文章于 2021-09-29 17:02:00 发布 · 295 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python #爬虫 #scrapy #settings #参数

爬虫专栏收录该内容

43 篇文章

订阅专栏

本文详细介绍了Scrapy爬虫的配置参数，包括并发请求、下载延迟、Cookies管理、User-Agent随机化、中间件及管道设置等关键内容，旨在帮助读者理解和优化爬虫性能。

# -*- coding: utf-8 -*-

# Scrapy settings for morekeywords project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import os
import sys

# Bootstrap Django from inside the Scrapy settings module. The order below
# matters: extend sys.path, name the Django settings module, then call
# django.setup().
#
# NOTE(review): os.path.abspath(".") is resolved against the current working
# directory of the process, so the directory appended here depends on where
# the crawl command is launched from -- confirm the expected launch directory.
sys.path.append(os.path.dirname(os.path.abspath(".")))  # parent of the CWD; the author found this variant to work
# Author's debugging notes (output observed on the development machine):
# print(os.path.dirname(os.path.abspath(".")))    D:\django\dataAnalysis\morekeywords
# print(os.path.dirname(os.path.abspath(__file__)))     D:\django\dataAnalysis\morekeywords\morekeywords    -> this variant fails with: No module named 'dataAnalysis'
# sys.path.append(r"D:\django\dataAnalysis\dataAnalysis")      # hard-coded alternative path
os.environ["DJANGO_SETTINGS_MODULE"] = "dataAnalysis.settings"
import django

django.setup()  # Initialise Django -- the first step for interacting with Django; per the author it can only be started after the crawler has been deployed
# Author's note: after this configuration the crawler currently cannot be started: ModuleNotFoundError: No module named 'dataAnalysis'
# ---------------------------------------------------------------------

# Project identity: the bot name, and the package that holds the spiders.
BOT_NAME = "morekeywords"

# New spiders are generated into NEWSPIDER_MODULE; SPIDER_MODULES lists the
# modules Scrapy searches for spiders (here, that same single module).
NEWSPIDER_MODULE = "morekeywords.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Optionally identify the crawler (and a contact website) via the User-Agent:
# USER_AGENT = 'morekeywords (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project.
ROBOTSTXT_OBEY = False

# Maximum number of concurrent (i.e. simultaneous) requests performed by the
# Scrapy downloader (Scrapy's default is 16).
CONCURRENT_REQUESTS = 32

# Time, in seconds, the downloader waits before downloading consecutive pages
# from the same website. It throttles the crawl so the server is not hit with
# back-to-back requests; fractional values are supported (default: 0).
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also the AutoThrottle settings and docs.
DOWNLOAD_DELAY = 3

# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are switched off (Scrapy enables them by default). Some websites use
# the client's cookie data to identify and analyse users, so disabling cookies
# keeps the target site from recognising this crawler's session that way.
COOKIES_ENABLED = False

# Encoding used for feed exports (per the official Scrapy documentation).
FEED_EXPORT_ENCODING = 'utf-8'

# Telnet console (enabled by default): lets you inspect and control the
# running crawler. Connect with `telnet <ip> <port>` and issue commands.
# To disable it:
# TELNETCONSOLE_ENABLED = False
# To configure it explicitly:
# TELNETCONSOLE_ENABLED = True
# TELNETCONSOLE_HOST = '127.0.0.1'
# TELNETCONSOLE_PORT = [6023,]

# Override the default request headers sent with every request.
#
# 'Accept-Encoding' is deliberately NOT set here. Scrapy's
# HttpCompressionMiddleware fills that header in only when it is absent, and
# then advertises exactly the encodings it is able to decode. Hard-coding
# 'gzip, deflate, br' would offer Brotli even when no Brotli library is
# installed, in which case the server may reply with a body that Scrapy
# cannot decompress.
DEFAULT_REQUEST_HEADERS = {
    'Connection': 'keep-alive',
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Origin': 'https://www.amazon.com',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'cors',
    'Referer': 'https://www.amazon.com/ref=nav_logo?language=en_US',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}

# Custom (non-Scrapy) setting selecting Chrome user-agent strings.
# NOTE(review): presumably read by the RandomUserAgentMiddlware enabled in
# DOWNLOADER_MIDDLEWARES -- confirm against morekeywords/middlewares.py.
RANDOM_UA_TYPE = "chrome"
# Spider middlewares: none enabled. Uncomment to activate the project's own.
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#    'morekeywords.middlewares.MorekeywordsSpiderMiddleware': 543,
# }

# Downloader middlewares, keyed by import path; the value is the middleware's
# order, and None switches a middleware off.
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # "morekeywords.middlewares.MorekeywordsDownloaderMiddleware": 543,
    # Project middleware, enabled at order 542.
    "morekeywords.middlewares.RandomUserAgentMiddlware": 542,
    # The stock User-Agent middleware has to be set to None (disabled).
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
}

# Extensions: custom extensions, invoked on the basis of signals.
# None are configured; uncomment to enable or disable extensions.
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Item pipelines, keyed by import path with their order as the value.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # "morekeywords.pipelines.MorekeywordsPipeline": 300,
    "morekeywords.pipelines.MysqlPipeline": 301,
}

# De-duplication of visited URLs (custom filter, currently not enabled).
# DUPEFILTER_CLASS = 'morekeywords.duplication.RepeatUrl'

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True   # 开启自动限速
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5      # 初始下载延迟
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60       # 最大下载延迟
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0     # 对每个远程服务器并行发送的平均请求数
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False


"""
启用缓存
    目的：将已经发送的请求或响应缓存下来，以便以后使用

    from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
    from scrapy.extensions.httpcache import DummyPolicy
    from scrapy.extensions.httpcache import FilesystemCacheStorage
"""

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True      # 是否启用缓存策略
# HTTPCACHE_EXPIRATION_SECS = 0    # 缓存过期时间（秒）；0 表示永不过期
# HTTPCACHE_DIR = 'httpcache'       # 缓存保存路径
# HTTPCACHE_IGNORE_HTTP_CODES = []      # 缓存忽略的Http状态码
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'      # 缓存存储的插件