# Crawl famous quotes from http://quotes.toscrape.com
import scrapy
from scrapy.spiders import CrawlSpider,Rule
from scrapy.linkextractors import LinkExtractor
class Quotes(CrawlSpider):
    """Crawl quotes.toscrape.com, emitting quote items and author items.

    Pagination links (``/page/<n>``) are followed and handed to
    ``parse_quotes``; author links (``/author/<name>``) are handed to
    ``parse_author`` and are not followed further.
    """

    name = 'quote'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    # Raw strings keep ``\d`` / ``\w`` as regex escapes instead of
    # (invalid) string escapes, which newer Python versions warn about.
    rules = (
        Rule(
            LinkExtractor(allow=r'/page/\d+'),
            callback='parse_quotes',
            follow=True,
        ),
        Rule(
            LinkExtractor(allow=r'/author/\w+'),
            callback='parse_author',
        ),
    )

    def parse_quotes(self, response):
        """Yield one dict per ``.quote`` element on a listing page.

        Each dict carries the quote text (``content``), the author name
        (``author``) and the list of tag strings (``tegs``).
        """
        for quote in response.css('.quote'):
            yield {
                'content': quote.css('.text::text').extract_first(),
                'author': quote.css('.author::text').extract_first(),
                # NOTE(review): 'tegs' looks like a typo for 'tags'; the key
                # is kept unchanged so existing consumers of the output keep
                # working.
                'tegs': quote.css('.tag::text').extract(),
            }

    def parse_author(self, response):
        """Return a dict describing the author shown on an author page.

        Every value is the first text node matched by its selector, or
        ``None`` when the selector matches nothing.
        """
        # The author page uses hyphenated class names (like
        # ``author-title``); the previous underscore selectors never
        # matched, so these fields were always None.
        name = response.css('.author-title::text').extract_first()
        author_born_date = response.css(
            '.author-born-date::text').extract_first()
        author_born_location = response.css(
            '.author-born-location::text').extract_first()
        author_description = response.css(
            '.author-description::text').extract_first()
        return {
            'name': name,
            'author_born_date': author_born_date,
            'author_born_location': author_born_location,
            'author_description': author_description,
        }