import scrapy
class PachongScrapyItem(scrapy.Item):
    """Container item for one batch of scraped Dangdang listings.

    Each field holds a *list* of extracted values (the spider calls
    ``extract()``, which returns all matches on a page).
    """

    # Fields to scrape
    title = scrapy.Field()       # book titles
    title_link = scrapy.Field()  # product-page URLs
    comment = scrapy.Field()     # review-count link texts
# --- spider/dd.py ---
import scrapy
from pachong_scrapy.items import PachongScrapyItem
from scrapy.http import Request
class DdSpider(scrapy.Spider):
    """Spider for a Dangdang category: scrapes titles, links and review counts."""

    name = 'dd'
    allowed_domains = ['dangdang.com']
    # First page of the category
    start_urls = ['http://category.dangdang.com/pg1-cid10010337.html']

    def parse(self, response):
        """Extract one page's listings, then queue pages 2-80.

        Scrapy's duplicate filter prevents the pages requested here from
        being re-queued on every callback invocation.
        """
        item = PachongScrapyItem()
        # XPath expressions; extract() returns a list of all matches
        item["title"] = response.xpath("//a[@name='itemlist-title']/@title").extract()
        item["title_link"] = response.xpath("//a[@name='itemlist-title']/@href").extract()
        item["comment"] = response.xpath("//a[@name='itemlist-review']/text()").extract()
        yield item
        # Multi-page crawl: request the remaining category pages
        for i in range(2, 81):
            url = 'http://category.dangdang.com/pg' + str(i) + '-cid10010337.html'
            yield Request(url, callback=self.parse)  # re-enter parse for each page
# --- pipelines.py ---
class PachongScrapyPipeline(object):
    """Pipeline that prints each scraped (title, link, comment) triple."""

    def process_item(self, item, spider):
        """Print every row of the batched item and pass the item through.

        The three field lists are indexed in parallel. They may differ in
        length (a listing can lack a review link), so the per-index
        try/except keeps one short list from aborting the whole batch.

        Returns the item unchanged so later pipelines still receive it.
        """
        for i in range(len(item['title'])):
            try:
                title = item['title'][i]
                title_link = item['title_link'][i]
                comment = item['comment'][i]
                print('{}:{}:{}'.format(title, title_link, comment))
            except Exception as err:
                # Best-effort logging: report the bad row and keep going.
                print(err)
        return item