Preface
The parse command fetches a URL and returns the requests and items objects that the spider's callback produces, which makes it handy for unit-testing a spider and checking its scraping results. (Note: if you pass a URL whose domain no spider in the project handles, e.g. scrapy parse http://www.baidu.com, the command raises an exception; an optional source-level fix is possible.)
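For example, a typical invocation might look like the line below; the spider name, callback, and URL are placeholders, while --spider, -c/--callback, and -d/--depth are the command's real options:

    scrapy parse --spider=myspider -c parse_item -d 2 http://www.example.com/page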
Code Walkthrough
Step into the run method of the parse command module.
Initializing the spidercls object
self.set_spidercls(url, opts)
def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': opts.spider})
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            logger.error('Unable to find spider for: %(url)s', {'url': url})

    # Request requires callback argument as callable or None, not string
    request = Request(url, None)  # build the request for the target URL
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests
(These are the two ways the spidercls object gets initialized: if the --spider option is given, the spider loader loads that spider directly; otherwise the spider whose domains match the Request object is looked up and used.)
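For the second path, the spidercls_for_request helper from scrapy.utils.spider picks the spider class that claims the request's host. A minimal sketch of that matching logic, simplified for illustration (find_spidercls is a hypothetical name; the real helper delegates to spider_loader.find_by_request and also handles multiple matches and a default):

    from urllib.parse import urlparse

    def find_spidercls(spider_loader, request):
        # hostname of the request URL, e.g. 'www.baidu.com'
        host = urlparse(request.url).hostname or ''
        for name in spider_loader.list():
            cls = spider_loader.load(name)
            for domain in getattr(cls, 'allowed_domains', None) or []:
                # match the domain itself or any subdomain of it
                if host == domain or host.endswith('.' + domain):
                    return cls
        return None  # no spider claims this URL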
Note the line that replaces the spider's start_requests:
_start_requests = lambda s: [self.prepare_request(s, request, opts)]
Collecting the requests and items objects
def prepare_request(self, spider, request, opts):
    def callback(response):
        # memorize first request
        if not self.first_response:
            self.first_response = response

        # determine real callback
        cb = response.meta['_callback']
        if not cb:
            # taken from the --callback option, if given
            if opts.callback:
                cb = opts.callback
            # otherwise derived from the spider's rules (first response only)
            elif opts.rules and self.first_response == response:
                cb = self.get_callback_from_rules(spider, response)
                if not cb:
                    logger.error('Cannot find a rule that matches %(url)r in spider: %(spider)s',
                                 {'url': response.url, 'spider': spider.name})
                    return
            # fall back to the spider's parse method
            else:
                cb = 'parse'

        if not callable(cb):
            # resolve the callback name to a method on the spider
            cb_method = getattr(spider, cb, None)
            if callable(cb_method):
                cb = cb_method
            else:
                logger.error('Cannot find callback %(callback)r in spider: %(spider)s',
                             {'callback': cb, 'spider': spider.name})
                return

        # parse items and requests
        depth = response.meta['_depth']
        # run the callback and collect the items and requests it yields
        items, requests = self.run_callback(response, cb)
        if opts.pipelines:
            itemproc = self.pcrawler.engine.scraper.itemproc
            for item in items:
                itemproc.process_item(item, spider)
        self.add_items(depth, items)
        self.add_requests(depth, requests)

        # keep following links until the requested depth is reached
        if depth < opts.depth:
            for req in requests:
                req.meta['_depth'] = depth + 1
                req.meta['_callback'] = req.callback
                req.callback = callback
        return requests

    # update request meta if any extra meta was passed through the --meta/-m opts.
    if opts.meta:
        request.meta.update(opts.meta)

    request.meta['_depth'] = 1
    request.meta['_callback'] = request.callback
    request.callback = callback
    return request
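The run_callback call above invokes the resolved callback and splits whatever it yields into items and requests. A simplified stand-in for that step, assuming the behavior described above (iterate_spider_output is a real Scrapy helper that flattens an item, a Request, a list, or a generator into one iterable):

    from scrapy import Request
    from scrapy.utils.spider import iterate_spider_output

    def run_callback(response, cb):
        items, requests = [], []
        # normalize the callback's output into a flat iterable, then sort
        # each element into the requests bucket or the items bucket
        for output in iterate_spider_output(cb(response)):
            if isinstance(output, Request):
                requests.append(output)
            else:
                items.append(output)
        return items, requests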
Initializing the spider's request queue
self.spidercls.start_requests = _start_requests
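Here is the same class-level monkey-patching pattern in isolation: start_requests is replaced on the spider class, so when the crawler later instantiates the spider, it yields exactly the one prepared request. MySpider, prepared, and the URL below are illustrative placeholders, not part of the parse command:

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'

        def parse(self, response):
            yield {'title': response.css('title::text').get()}

    # patch the class, before the crawler creates an instance
    prepared = scrapy.Request('http://example.com')
    MySpider.start_requests = lambda self: [prepared]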
Running the spider and printing the results
if self.spidercls and opts.depth > 0:
    self.start_parsing(url, opts)
    self.print_results(opts)
Crawler scheduling
def start_parsing(self, url, opts):
    self.crawler_process.crawl(self.spidercls, **opts.spargs)
    self.pcrawler = list(self.crawler_process.crawlers)[0]
    self.crawler_process.start()

    if not self.first_response:
        logger.error('No response downloaded for: %(url)s', {'url': url})
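The same crawl-then-start flow can be reproduced outside the command in a few lines: CrawlerProcess.crawl schedules a spider and CrawlerProcess.start runs the Twisted reactor until every crawl finishes. DemoSpider and example.com below are placeholders for illustration:

    import scrapy
    from scrapy.crawler import CrawlerProcess

    class DemoSpider(scrapy.Spider):
        name = 'demo'
        start_urls = ['http://example.com']

        def parse(self, response):
            yield {'url': response.url}

    process = CrawlerProcess()
    process.crawl(DemoSpider)   # schedule the spider
    process.start()             # run the reactor; blocks until done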
GitHub repository
https://github.com/wangrenlei/debug_scrapy
The End