立即学习:https://edu.youkuaiyun.com/course/play/24797/282224?utm_source=blogtoedu
# Scrape Yiche (yiche.com) RAV4 bare-car price data.
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class YichespiderPipeline(object):
    """Append every scraped item to carSpider.json, one JSON object per line.

    NOTE(review): the output is newline-separated JSON objects with trailing
    commas, not a valid JSON array (no enclosing brackets are ever written) —
    confirm whether downstream consumers expect this exact format.
    """

    def __init__(self):
        # Open the output file once for the lifetime of the pipeline.
        # 'wb+' (binary read/write) is required so close_spider() can do a
        # relative seek; text-mode files reject seek(offset, 1) with offset != 0.
        self.json_file = open('carSpider.json', 'wb+')
        self.json_file.write('\n'.encode('UTF-8'))

    def process_item(self, item, spider):
        """Serialize one yielded item and append it to the file.

        item: the item object yielded by the spider (dict-convertible).
        spider: the spider instance (unused).
        Returns the item unchanged so later pipelines still receive it.
        """
        # ensure_ascii=False keeps Chinese text readable instead of \uXXXX escapes.
        text = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.json_file.write(text.encode("UTF-8"))
        # BUG FIX: a Scrapy pipeline must return the item (or raise DropItem);
        # the original returned None, silently starving any downstream pipeline.
        return item

    def close_spider(self, spider):
        """Overwrite the final trailing ',' with a newline and close the file."""
        print('-------------------------关闭爬虫-------------------------------------')
        # Relative seek (whence=1) backs up over the last ",\n" so the write
        # below replaces the dangling comma; works only because the file is
        # open in binary mode.
        self.json_file.seek(-2, 1)
        self.json_file.write('\n'.encode("UTF-8"))
        self.json_file.close()
# -*- coding: utf-8 -*-
import scrapy

from YicheSpider.items import YichespiderItem


class CarSpiderSpider(scrapy.Spider):
    """Crawl bare-car ("luochejia") price listings for the FAW-Toyota RAV4.

    Starts at page 1 and follows the "next" pagination link until no
    <a class="next-on"> anchor is present.
    """

    name = 'car_spider'
    allowed_domains = ['luochejia.yiche.com']
    # Start from page 1 only; further pages are discovered via the next-link,
    # instead of pre-generating ?page=1..61 URLs as the commented-out original did.
    start_urls = ['http://luochejia.yiche.com/yiqifengtianrav4/price/?page=1']

    def parse(self, response):
        """Yield one item per purchase record, then a Request for the next page."""
        # Each price-list-box element holds one purchase record.
        for record in response.xpath('//div[@class="price-list-box"]'):
            item = YichespiderItem()
            # Trim/style of the raw text is left to downstream processing.
            item['design'] = record.xpath(
                './div[@class="con-box"]/div[@class="tit"]/text()').extract_first()
            # NOTE(review): buy_date and buy_addr use the *identical* XPath, so
            # both fields receive the same first <p class="other"> text node —
            # most likely one of them should select a different node/index;
            # confirm against the live page markup. Behavior kept as-is.
            item['buy_date'] = record.xpath(
                './div[@class="con-box"]/p[@class="other"]/text()').extract_first()
            item['buy_addr'] = record.xpath(
                './div[@class="con-box"]/p[@class="other"]/text()').extract_first()
            # Bare-car price (the <em> holds the numeric part).
            item['real_price'] = record.xpath(
                './div[@class="con-box"]/div[@class="price"]/p[@class="luochejia"]/em/text()').extract_first()
            # Manufacturer's suggested retail price.
            item['original_price'] = record.xpath(
                './div[@class="con-box"]/div[@class="price"]/p[@class="zhidaojia"]/text()').extract_first()
            yield item

        # Pagination: on the first page there is a single "next" anchor; on
        # later pages the first next-on anchor is "previous" and the second is
        # "next", hence the index choice below.
        next_page = response.xpath(
            '//div[@class="pagination mbt20"]/div/a[@class="next-on"]/@href').extract()
        # Dead variable removed: the original set countNum = 0 and then tested
        # countNum == 0 in the elif, which was always true.
        if not next_page:
            return
        if len(next_page) > 1:
            new_link = next_page[1]
        else:
            new_link = next_page[0]
            print(
                "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++当前是第一页+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        next_url = 'http://luochejia.yiche.com' + new_link
        print(
            '下一页地址:' + next_url + '+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
        yield scrapy.Request(next_url, callback=self.parse)