立即学习:https://edu.youkuaiyun.com/course/play/24797/282224?utm_source=blogtoedu
# Scrape Yiche (yiche.com) RAV4 bare-car price data.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class YichespiderPipeline(object):
    """Append every scraped item to carSpider.json, one JSON object per line.

    NOTE(review): the output is newline-separated JSON objects with trailing
    commas, not a valid JSON array (no enclosing brackets are ever written) —
    confirm whether downstream consumers expect this exact format.
    """

    def __init__(self):
        # Open the output file once for the lifetime of the pipeline.
        # 'wb+' (binary read/write) is required so close_spider() can do a
        # relative seek; text-mode files reject seek(offset, 1) with offset != 0.
        self.json_file = open('carSpider.json', 'wb+')
        self.json_file.write('\n'.encode('UTF-8'))

    def process_item(self, item, spider):
        """Serialize one yielded item and append it to the file.

        item: the item object yielded by the spider (dict-convertible).
        spider: the spider instance (unused).
        Returns the item unchanged so later pipelines still receive it.
        """
        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.json_file.write(text.encode("UTF-8"))
        # BUG FIX: a Scrapy pipeline must return the item (or raise DropItem);
        # the original returned None, silently starving any downstream pipeline.
        return item

    def close_spider(self, spider):
        """Overwrite the final trailing ',' with a newline and close the file."""
        print('-------------------------关闭爬虫-------------------------------------')
        # Relative seek (whence=1) backs up over the last ",\n" so the write
        # below replaces the dangling comma; works only because the file is
        # open in binary mode.
        self.json_file.seek(-2, 1)
        self.json_file.write('\n'.encode("UTF-8"))
        self.json_file.close()
# -*- coding: utf-8 -*-
import scrapy

from YicheSpider.items import YichespiderItem


class CarSpiderSpider(scrapy.Spider):
    """Crawl bare-car ("luochejia") price listings for the FAW-Toyota RAV4.

    Starts at page 1 and follows the "next" pagination link until no
    <a class="next-on"> anchor is present.
    """

    name = 'car_spider'
    allowed_domains = ['luochejia.yiche.com']
    # Start from page 1 only; further pages are discovered via the next-link,
    # instead of pre-generating ?page=1..61 URLs as the commented-out original did.
    start_urls = ['http://luochejia.yiche.com/yiqifengtianrav4/price/?page=1']

    def parse(self, response):
        """Yield one item per purchase record, then a Request for the next page."""
        # Each price-list-box element holds one purchase record.
        for record in response.xpath('//div[@class="price-list-box"]'):
            item = YichespiderItem()
            # Trim/style of the raw text is left to downstream processing.
            item['design'] = record.xpath(
                './div[@class="con-box"]/div[@class="tit"]/text()').extract_first()
            # NOTE(review): buy_date and buy_addr use the *identical* XPath, so
            # both fields receive the same first <p class="other"> text node —
            # most likely one of them should select a different node/index;
            # confirm against the live page markup. Behavior kept as-is.
            item['buy_date'] = record.xpath(
                './div[@class="con-box"]/p[@class="other"]/text()').extract_first()
            item['buy_addr'] = record.xpath(
                './div[@class="con-box"]/p[@class="other"]/text()').extract_first()
            # Bare-car price (the <em> holds the numeric part).
            item['real_price'] = record.xpath(
                './div[@class="con-box"]/div[@class="price"]/p[@class="luochejia"]/em/text()').extract_first()
            # Manufacturer's suggested retail price.
            item['original_price'] = record.xpath(
                './div[@class="con-box"]/div[@class="price"]/p[@class="zhidaojia"]/text()').extract_first()
            yield item

        # Pagination: on the first page there is a single "next" anchor; on
        # later pages the first next-on anchor is "previous" and the second is
        # "next", hence the index choice below.
        next_page = response.xpath(
            '//div[@class="pagination mbt20"]/div/a[@class="next-on"]/@href').extract()
        # Dead variable removed: the original set countNum = 0 and then tested
        # countNum == 0 in the elif, which was always true.
        if not next_page:
            return
        if len(next_page) > 1:
            new_link = next_page[1]
        else:
            new_link = next_page[0]
            print(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++当前是第一页+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        next_url = 'http://luochejia.yiche.com' + new_link
        print(
            '下一页地址:' + next_url + '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        yield scrapy.Request(next_url, callback=self.parse)