### Implementation code

```python
# -*- coding: utf-8 -*-
import scrapy
from ..items import ShengshiItem
from scrapy_redis.spiders import RedisSpider


class ShengshiSSpider(RedisSpider):
    name = 'shengshi_s'
    # allowed_domains = ['sheng-shi.com']
    redis_key = "ShengshiSSpider:start_urls"

    # Tuple/list of URLs to crawl; the spider would normally start fetching here.
    # With scrapy_redis the start URL is pushed into redis_key instead.
    # start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html']

    def parse(self, response):
        # Level 1: provinces
        first_city = response.xpath('//tr[@class="provincetr"]/td/a')
        for city in first_city:
            f = city.xpath('./text()').extract_first()
            second_href = city.xpath('./@href').extract_first()
            if second_href is not None:
                new_url = response.urljoin(second_href)
                yield scrapy.Request(new_url, callback=self.city2,
                                     meta={'f': f}, dont_filter=True)

    def city2(self, response):
        # Level 2: prefecture-level cities
        second_citys = response.xpath('//tr[@class="citytr"]/td[2]/a')
        for second_city in second_citys:
            se = second_city.xpath('./text()').extract_first()
            f = response.meta['f']
            third_href = second_city.xpath('./@href').extract_first()
            if third_href is not None:
                new_url = response.urljoin(third_href)
                yield scrapy.Request(new_url, callback=self.city3,
                                     meta={'f': f, 'se': se}, dont_filter=True)

    def city3(self, response):
        # Level 3: counties/districts
        third_citys = response.xpath('//tr[@class="countytr"]/td[2]/a')
        for third_city in third_citys:
            th = third_city.xpath('./text()').extract_first()
            f = response.meta['f']
            se = response.meta['se']
            fourth_href = third_city.xpath('./@href').extract_first()
            if fourth_href is not None:
                new_url = response.urljoin(fourth_href)
                yield scrapy.Request(new_url, callback=self.city4,
                                     meta={'f': f, 'se': se, 'th': th}, dont_filter=True)

    def city4(self, response):
        # Level 4: towns/townships
        fourth_citys = response.xpath('//tr[@class="towntr"]/td[2]/a')
        for fourth_city in fourth_citys:
            fo = fourth_city.xpath('./text()').extract_first()
            f = response.meta['f']
            se = response.meta['se']
            th = response.meta['th']
            fifth_href = fourth_city.xpath('./@href').extract_first()
            if fifth_href is not None:
                new_url = response.urljoin(fifth_href)
                yield scrapy.Request(new_url, callback=self.city5,
                                     meta={'f': f, 'se': se, 'th': th, 'fo': fo},
                                     dont_filter=True)

    def city5(self, response):
        # Level 5: villages/communities (leaf pages, no further links)
        fifth_citys = response.xpath('//tr[@class="villagetr"]/td[3]/text()').extract()
        for fifth_city in fifth_citys:
            item = ShengshiItem()
            f = response.meta['f']
            se = response.meta['se']
            th = response.meta['th']
            fo = response.meta['fo']
            item['first_city'] = f
            item['second_city'] = se
            item['third_city'] = th
            item['fourth_city'] = fo
            item['fifth_city'] = fifth_city
            yield item
```
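Because the spider inherits from `RedisSpider`, it defines no local `start_urls`; each worker started with `scrapy crawl shengshi_s` idles until a URL is pushed under `redis_key`. A minimal sketch using the redis-py client (an assumption on my part; pushing the URL with `redis-cli lpush` works just as well):

```python
# Kick off the distributed crawl: every RedisSpider worker blocks until a start
# URL appears under the configured redis_key. Assumes the redis-py package and
# a local Redis instance matching REDIS_HOST/REDIS_PORT in settings.py.
import redis

r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('ShengshiSSpider:start_urls',
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html')
```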
## settings configuration
```python
# Dedup component: request fingerprints are stored and checked in Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Use the scrapy_redis scheduler, which distributes requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Whether to flush the scheduler queue and dedup records before starting
# (True = flush, False = keep)
SCHEDULER_FLUSH_ON_START = False
# Maximum time (seconds) to wait when the scheduler queue is empty before giving up
SCHEDULER_IDLE_BEFORE_CLOSE = 10
# Whether to keep the scheduler queue and dedup records on close
# (True = keep, False = clear)
SCHEDULER_PERSIST = True

# Optionally store scraped items directly in Redis
# ITEM_PIPELINES = {
#     'scrapy_redis.pipelines.RedisPipeline': 300,
# }

# Redis server address
REDIS_HOST = '127.0.0.1'
# Redis port
REDIS_PORT = 6379
# Encoding used when writing to Redis
# REDIS_ENCODING = "UTF-8"
```
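The MongoDB pipeline defined further below only runs if it is registered in `ITEM_PIPELINES`. A minimal sketch, assuming the Scrapy project package is named `shengshi` (adjust the module path to your own project):

```python
# settings.py -- route yielded items to the custom MongoDB pipeline.
# 'shengshi.pipelines' is an assumed module path; use your project's package name.
ITEM_PIPELINES = {
    'shengshi.pipelines.ShengshiPipeline': 300,
}
```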
## Item fields
```python
import scrapy


class ShengshiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    first_city = scrapy.Field()   # province
    second_city = scrapy.Field()  # prefecture-level city
    third_city = scrapy.Field()   # county/district
    fourth_city = scrapy.Field()  # town/township
    fifth_city = scrapy.Field()   # village/community
```
## Pipelines: store items in MongoDB
```python
import pymongo


class ShengshiPipeline(object):
    def __init__(self):
        # Connect to a local MongoDB instance and select the 'shengshi2' database
        self.client = pymongo.MongoClient('localhost')
        self.db = self.client['shengshi2']

    def process_item(self, item, spider):
        # insert_one replaces the deprecated Collection.insert()
        self.db['liandong'].insert_one(dict(item))
        return item
```
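To spot-check that items are actually landing in MongoDB, a quick query against the same database and collection can be run from a Python shell (assuming MongoDB is running locally on the default port, as the pipeline above expects):

```python
import pymongo

# Assumes a local MongoDB on the default port, matching the pipeline above.
client = pymongo.MongoClient('localhost')
collection = client['shengshi2']['liandong']
print(collection.find_one())            # one stored record, or None if nothing was crawled yet
print(collection.count_documents({}))   # total number of stored records
```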