Running Multiple Spiders in a Single Scrapy Process

First, create the spiders you want to run.

Modify the two spider files so they look like this:

Python
# -*- coding: utf-8 -*-
import scrapy


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# -*- coding: utf-8 -*-
import scrapy


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())

Next, create a new script, main.py.

The code for main.py is as follows:

Python
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat Official Account: zeropython
@File: main.py
"""
import scrapy
from scrapy.crawler import CrawlerProcess


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


process = CrawlerProcess()
process.crawl(SeoSpider)
process.crawl(Seo2Spider)
process.start()  # the script will block here until all crawling jobs are finished

 

 

The result: both spiders run in the same process, and each prints the title of the page it fetched.
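In the script above, CrawlerProcess() is created without arguments, so the spiders run with Scrapy's default settings. As a minimal sketch (an assumption, not part of the original post): if main.py sits inside a Scrapy project, the project settings can be passed in so that pipelines and middlewares from settings.py are applied, and the spiders can then be referenced by name instead of importing the classes.

Python
# A sketch, assuming main.py lives inside a Scrapy project so that
# get_project_settings() can locate settings.py and the spider loader
# can resolve spiders by name.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('seo')    # looked up by name via the spider loader
process.crawl('seo2')
process.start()  # blocks until both spiders are done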

Method 2: using CrawlerRunner. Unlike CrawlerProcess, CrawlerRunner does not start the Twisted reactor for you, so the script starts and stops the reactor itself:

Python
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat Official Account: zeropython
@File: main.py
"""
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


# Spider 1
class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# Spider 2
class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


configure_logging()
runner = CrawlerRunner()
runner.crawl(SeoSpider)
runner.crawl(Seo2Spider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())

reactor.run()  # the script will block here until all crawling jobs are finished

The result is essentially the same as with the first method.
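Both scripts above run the two spiders concurrently. If they need to run one after the other instead, the Scrapy documentation shows a sequential variant of the CrawlerRunner approach that chains the deferreds; a minimal sketch, reusing the SeoSpider and Seo2Spider classes defined above:

Python
# Run the spiders sequentially by chaining deferreds (pattern from the
# Scrapy docs); SeoSpider and Seo2Spider are the classes defined above.
from twisted.internet import defer, reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(SeoSpider)    # finish the first spider...
    yield runner.crawl(Seo2Spider)   # ...before starting the second
    reactor.stop()

crawl()
reactor.run()  # the script will block here until the last crawl finishes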




  • WeChat Official Account: zeropython · QQ: 5868037 · QQ mail: 5868037@qq.com