1. Common Scrapy configurations
1. Allowing error status codes through to the spider
HTTPERROR_ALLOWED_CODES = [403]
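By default Scrapy's HttpError middleware drops non-2xx responses before they reach spider callbacks; whitelisting a code such as 403 lets the callback inspect those responses. A minimal sketch of handling an allowed 403, assuming a hypothetical spider name and a placeholder URL:
import scrapy

class BlockedPageSpider(scrapy.Spider):
    name = "blocked_page_demo"              # hypothetical spider name
    start_urls = ["https://example.com"]    # placeholder URL

    custom_settings = {
        "HTTPERROR_ALLOWED_CODES": [403],   # let 403 responses reach parse()
    }

    def parse(self, response):
        if response.status == 403:
            self.logger.warning("Got 403 for %s, probably blocked", response.url)
            return
        yield {"url": response.url, "title": response.css("title::text").get()}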
2. Bypassing Scrapy's duplicate request filter
Passing dont_filter=True tells the scheduler not to deduplicate this particular request; leave it at the default False to keep duplicate filtering enabled.
yield scrapy.Request(url=url, cookies=self.cookies, callback=self.parse, dont_filter=True)
3. Retrying failed requests (retry)
RETRY_ENABLED = True                  # retrying is enabled by default
RETRY_TIMES = 3                       # retries on top of the first attempt
RETRY_HTTP_CODES = [429, 404, 403]    # response codes that trigger a retry
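These settings can also be scoped to a single spider through custom_settings instead of the project-wide settings.py. A minimal sketch, with the spider name as a placeholder:
import scrapy

class RetryDemoSpider(scrapy.Spider):
    name = "retry_demo"                   # hypothetical spider name
    custom_settings = {
        "RETRY_ENABLED": True,
        "RETRY_TIMES": 5,                 # retry harder for this spider only
        "RETRY_HTTP_CODES": [429, 503],   # e.g. rate limits and temporary outages
    }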
4. Settings Scrapy enables by default: the base downloader middlewares
DOWNLOADER_MIDDLEWARES_BASE = {
'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': 100,
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware': 300,
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware': 350,
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': 400,
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 500,
'scrapy.downloadermiddlewares.retry.RetryMiddleware': 550,
'scrapy.downloadermiddlewares.ajaxcrawl.AjaxCrawlMiddleware': 560,
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware': 580,
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 590,
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': 600,
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 700,
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 750,
'scrapy.downloadermiddlewares.stats.DownloaderStats': 850,
'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware': 900,
}
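To disable one of these defaults or add your own middleware, override DOWNLOADER_MIDDLEWARES in settings.py; it is merged with the base dict above, and setting a middleware to None disables it. A minimal sketch, where the custom middleware path is a placeholder:
DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in user agent middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    # Hypothetical replacement middleware, placed at the same priority
    'myproject.middlewares.RandomUserAgentMiddleware': 500,
}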
5. Download timeout
DOWNLOAD_TIMEOUT = 180    # seconds; 180 is also Scrapy's default
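The timeout can also be overridden per request via the download_timeout meta key, which the DownloadTimeoutMiddleware listed above reads. A minimal sketch inside a spider callback, with a placeholder URL:
yield scrapy.Request(
    url="https://example.com/slow-endpoint",   # placeholder URL
    meta={"download_timeout": 30},             # 30-second timeout for this request only
    callback=self.parse,
)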
6. Scrapy log levels (from most to least severe; see the example after the list)
- CRITICAL: critical errors
- ERROR: ordinary errors
- WARNING: warnings
- INFO: informational messages
- DEBUG: debugging information
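The active level is chosen with the LOG_LEVEL setting (DEBUG by default) and applies to messages emitted through the spider's built-in logger; it can also be overridden on the command line with -s LOG_LEVEL=WARNING. A minimal sketch, with the spider name and URL as placeholders:
# settings.py
LOG_LEVEL = "INFO"    # hide DEBUG output

# spider
import scrapy

class LogDemoSpider(scrapy.Spider):
    name = "log_demo"                        # hypothetical spider name
    start_urls = ["https://example.com"]     # placeholder URL

    def parse(self, response):
        self.logger.info("Parsed %s", response.url)                # shown at INFO
        self.logger.debug("Body length: %d", len(response.body))   # hidden at INFO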