Running Multiple Scrapy Spiders in a Single Process

First, create the spiders themselves.

Modify the spider files so they look like this:

Python
# --- first spider file: spider 'seo2' ---
# -*- coding: utf-8 -*-
import scrapy


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# --- second spider file: spider 'seo' ---
# -*- coding: utf-8 -*-
import scrapy


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())

Next, create a new script named main.py.

The code of main.py is as follows:

Python
# -*- coding: utf-8 -*- """ @Time: 2018/1/22 @Author: songhao @微信公众号: zeropython @File: main.py """ import scrapy from scrapy.crawler import CrawlerProcess class SeoSpider(scrapy.Spider): name = 'seo' allowed_domains = ['www.168seo.cn'] start_urls = ['http://www.168seo.cn/'] def parse(self, response): print(response.css('title::text').extract_first()) class Seo2Spider(scrapy.Spider): name = 'seo2' allowed_domains = ['www.168seo.cn'] start_urls = ['http://www.168seo.cn/'] def parse(self, response): print(response.css('title::text').extract_first()) process = CrawlerProcess() process.crawl(SeoSpider) process.crawl(Seo2Spider) process.start() # the script will block here until all crawling jobs are finished
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat official account: zeropython
@File: main.py
"""
import scrapy
from scrapy.crawler import CrawlerProcess


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# schedule both spiders in the same process, then start the reactor
process = CrawlerProcess()
process.crawl(SeoSpider)
process.crawl(Seo2Spider)
process.start()  # the script will block here until all crawling jobs are finished

 

 

The result: both spiders run in the same process, and each prints its page title.
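
Note that a bare CrawlerProcess() does not load the settings.py of your Scrapy project, so pipelines and middleware configured there will not apply. Below is a minimal sketch of passing the project settings in explicitly, assuming main.py sits inside the Scrapy project (next to scrapy.cfg) so that get_project_settings() can find them; with project settings loaded, spiders can also be referred to by their names.

Python
# -*- coding: utf-8 -*-
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() reads the surrounding project's settings.py;
# this assumes the script is started from inside the Scrapy project.
process = CrawlerProcess(get_project_settings())
process.crawl('seo')    # with project settings loaded, the spider name string is enough
process.crawl('seo2')
process.start()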

Method 2:

Python
# -*- coding: utf-8 -*- """ @Time: 2018/1/22 @Author: songhao @微信公众号: zeropython @File: main.py """ import scrapy from twisted.internet import reactor from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging # 爬虫1 class SeoSpider(scrapy.Spider): name = 'seo' allowed_domains = ['www.168seo.cn'] start_urls = ['http://www.168seo.cn/'] def parse(self, response): print(response.css('title::text').extract_first()) # 爬虫2 class Seo2Spider(scrapy.Spider): name = 'seo2' allowed_domains = ['www.168seo.cn'] start_urls = ['http://www.168seo.cn/'] def parse(self, response): print(response.css('title::text').extract_first()) configure_logging() runner = CrawlerRunner() runner.crawl(SeoSpider) runner.crawl(Seo2Spider) d = runner.join() d.addBoth(lambda _: reactor.stop()) reactor.run() # the script will block here until all crawling jobs are finished
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat official account: zeropython
@File: main.py
"""
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


# Spider 1
class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# Spider 2
class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


configure_logging()
runner = CrawlerRunner()
runner.crawl(SeoSpider)
runner.crawl(Seo2Spider)
d = runner.join()                    # Deferred that fires when both crawls are done
d.addBoth(lambda _: reactor.stop())  # stop the reactor once they finish

reactor.run()  # the script will block here until all crawling jobs are finished

The result is roughly the same as with the first method.
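
If you need the spiders to run one after another instead of concurrently, the CrawlerRunner approach can be chained with Twisted's inlineCallbacks, following the sequential-run pattern from the Scrapy documentation. Below is a minimal sketch that reuses the SeoSpider and Seo2Spider classes defined above and replaces the last block of method 2's main.py.

Python
# -*- coding: utf-8 -*-
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()


@defer.inlineCallbacks
def crawl():
    # each yield waits for the previous crawl to finish before starting the next
    yield runner.crawl(SeoSpider)    # SeoSpider / Seo2Spider are the classes defined above
    yield runner.crawl(Seo2Spider)
    reactor.stop()


crawl()
reactor.run()  # blocks until reactor.stop() is called after both crawls finish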




  • zeropython (WeChat official account) · QQ: 5868037 · Email: 5868037@qq.com