First, create the corresponding spiders.
Modify the spider files so they look like this (two spiders, seo2 and seo):
# -*- coding: utf-8 -*-
import scrapy


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# -*- coding: utf-8 -*-
import scrapy


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())
Next, create a new script, main.py.
The code of main.py is as follows:
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat official account: zeropython
@File: main.py
"""
import scrapy
from scrapy.crawler import CrawlerProcess


class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


process = CrawlerProcess()
process.crawl(SeoSpider)
process.crawl(Seo2Spider)
process.start()  # the script will block here until all crawling jobs are finished
Running main.py starts both spiders in the same process; each one prints the title of the page it scraped.
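If the script lives inside an existing Scrapy project and the spiders should pick up the project's settings.py (pipelines, middlewares, and so on), CrawlerProcess can be given the project settings explicitly. A minimal sketch, assuming the script is executed from within a Scrapy project directory:

# Sketch: same as method 1, but loading the surrounding project's settings.
# Assumes this script is run inside an existing Scrapy project so that
# get_project_settings() can locate settings.py.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl(SeoSpider)   # spider classes as defined above
process.crawl(Seo2Spider)
process.start()

With project settings in place, crawl() should also accept a spider's name (for example process.crawl('seo')) and resolve it through the project's spider loader.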
Method 2:
# -*- coding: utf-8 -*-
"""
@Time: 2018/1/22
@Author: songhao
@WeChat official account: zeropython
@File: main.py
"""
import scrapy
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging


# Spider 1
class SeoSpider(scrapy.Spider):
    name = 'seo'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


# Spider 2
class Seo2Spider(scrapy.Spider):
    name = 'seo2'
    allowed_domains = ['www.168seo.cn']
    start_urls = ['http://www.168seo.cn/']

    def parse(self, response):
        print(response.css('title::text').extract_first())


configure_logging()
runner = CrawlerRunner()
runner.crawl(SeoSpider)
runner.crawl(Seo2Spider)
d = runner.join()
d.addBoth(lambda _: reactor.stop())
reactor.run()  # the script will block here until all crawling jobs are finished
The result is essentially the same as with the first method.
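Both approaches above start the two spiders concurrently in the same process. If the spiders need to run one after another instead, the CrawlerRunner variant can chain the crawls with a Twisted inlineCallbacks coroutine, as in the pattern from the Scrapy documentation. A minimal sketch, reusing the SeoSpider and Seo2Spider classes defined above:

# Sketch: run the two spiders sequentially instead of in parallel.
from twisted.internet import reactor, defer
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()

@defer.inlineCallbacks
def crawl():
    yield runner.crawl(SeoSpider)    # finishes before the next crawl starts
    yield runner.crawl(Seo2Spider)
    reactor.stop()

crawl()
reactor.run()  # blocks until crawl() stops the reactor

The practical difference between the two classes: CrawlerProcess manages the Twisted reactor for you and suits standalone scripts, while CrawlerRunner leaves reactor management to your code, which is useful when embedding Scrapy in an application that already runs a reactor.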