Spider file
# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    name = 'test'
    # allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.youkuaiyun.com/1']

    # Per-spider settings: these override the project settings.py for this spider only
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'Test_C.middlewares.Random_UA': 1,
            # 'Test_C.middlewares.RZ_proxy': 2,
        },
        # Close the spider automatically on a bad response. Priority 20 puts this
        # middleware before the built-in HttpErrorMiddleware (priority 50), so its
        # process_spider_input() still sees non-2xx responses before they are filtered out.
        'SPIDER_MIDDLEWARES': {
            'Test_C.middlewares.Close_spider': 20,
        },
        # Download timeout, in seconds
        'DOWNLOAD_TIMEOUT': 5,
        # Maximum number of download retries
        'RETRY_TIMES': 3,
    }

    def parse(self, response):
        print('*_' * 20)
        print(response.status)
        print('*_' * 20)
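The custom_settings above enable Test_C.middlewares.Random_UA, but that class is not shown in the original. A minimal sketch of such a random User-Agent downloader middleware, assuming a small hard-coded User-Agent pool (it would live in the same middlewares.py shown below):

import random


class Random_UA(object):
    # Hypothetical User-Agent pool; a real project might load these from
    # settings or from a package such as fake-useragent
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Safari/605.1.15',
    ]

    def process_request(self, request, spider):
        # Attach a randomly chosen User-Agent to every outgoing request
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        return None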
middlewares.py file
from scrapy.exceptions import CloseSpider


class Close_spider(object):
    def process_spider_input(self, response, spider):
        # Treat anything outside the 2xx range as a failure and stop the crawl
        # (the original checked <= 300, which would wrongly accept the 300 redirect code)
        if not 200 <= response.status < 300:
            raise CloseSpider('%s returned an abnormal response, closing the spider!' % response.url)
        return None

    def process_spider_output(self, response, result, spider):
        # Pass the spider's results through unchanged
        for res in result:
            yield res
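The commented-out RZ_proxy entry in custom_settings points at a proxy downloader middleware the post also leaves out. A minimal sketch, assuming a hypothetical hard-coded proxy pool (the addresses below are placeholders):

import random


class RZ_proxy(object):
    # Hypothetical proxy pool; a real project would load these from a
    # provider, a file, or the project settings
    PROXIES = [
        'http://127.0.0.1:8888',
        'http://127.0.0.1:8889',
    ]

    def process_request(self, request, spider):
        # Route the request through a randomly chosen proxy
        request.meta['proxy'] = random.choice(self.PROXIES)
        return None

Setting request.meta['proxy'] is the standard hook honored by Scrapy's built-in HttpProxyMiddleware, so beyond enabling the middleware no extra configuration should be needed.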