# coding=utf-8
import scrapy
from qidian.items import QidianItem
class QiDian(scrapy.Spider):
    """Spider that walks qidian.com listing pages and scrapes each book page.

    Flow: ``parse`` handles the first listing page and schedules the
    remaining listing pages; ``item_parse`` extracts the per-book links from
    a listing page; ``book_parse`` builds one ``QidianItem`` per book page.
    """

    name = 'qidian_spider'
    start_urls = ['https://www.qidian.com/all']

    # Template for listing pages after the first; ``{}`` is the page number.
    page_url_template = 'https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}'
    # Listing pages first_extra_page..last_extra_page (inclusive) are
    # requested in addition to the start URL.
    first_extra_page = 2
    last_extra_page = 4

    def parse(self, response):
        """Process the first listing page, then request the remaining ones.

        Args:
            response: The downloaded response for the start URL.

        Yields:
            Requests for every book found on the first listing page, followed
            by requests for the additional listing pages.
        """
        # The first page is already in hand, so parse it directly instead of
        # issuing a second request for the very same URL.
        for request in self.item_parse(response):
            yield request

        for page in range(self.first_extra_page, self.last_extra_page + 1):
            yield scrapy.Request(
                self.page_url_template.format(page),
                callback=self.item_parse,
            )

    def item_parse(self, response):
        """Extract the URL of every book listed on a listing page.

        Args:
            response: The downloaded response for one listing page.

        Yields:
            One request per book link, handled by ``book_parse``.
        """
        urls = response.xpath('//*[@class="all-img-list cf"]/li/div/h4/a/@href').extract()
        for url in urls:
            # The hrefs may be relative or scheme-less; urljoin resolves them
            # against the listing page URL.
            yield scrapy.Request(response.urljoin(url), callback=self.book_parse)

    def book_parse(self, response):
        """Scrape the book name and author from a book detail page.

        Args:
            response: The downloaded response for one book page.

        Yields:
            A ``QidianItem`` with ``book_name`` and ``Author`` filled in.
            Nothing is yielded when either field cannot be found.
        """
        # extract_first() returns None on no match instead of raising
        # IndexError the way ``extract()[0]`` does.
        book_name = response.xpath('//*[@class="book-info "]/h1/em/text()').extract_first()
        author = response.xpath('//*[@class="book-info "]/h1/span/a/text()').extract_first()
        if book_name is None or author is None:
            self.logger.warning(
                'Could not extract book name/author from %s; skipping',
                response.url,
            )
            return

        item = QidianItem()
        item['book_name'] = book_name
        item['Author'] = author
        yield item