Creating the project
Create a crawler project that scrapes data from Mogujie (蘑菇街).
1. cd into the directory where the project should live
2. `scrapy startproject MogujiePro`
3. Open the project in PyCharm
4. `scrapy genspider mogu mogujie.com`
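After these commands the generated project layout should look roughly like this:

```
MogujiePro/
├── scrapy.cfg
└── MogujiePro/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── mogu.py
```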
File overview
- The settings.py file
```python
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    # This dict enables/disables downloader middleware components, both custom
    # ones and Scrapy's built-ins. What a downloader middleware does: while the
    # downloader sends a request to the server, the request object is passed
    # through each middleware, in priority order, for processing.
    # Setting a middleware to None disables it.
    'MogujiePro.middlewares.MogujieproDownloaderMiddleware': None,

    # With Selenium, a real browser takes over the work of these defaults,
    # so the unused built-in middlewares can be disabled:
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': None,
    'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware': None,

    # Register the custom Selenium middleware
    'MogujiePro.middlewares.SeleniumDownloaderMiddleware': 545,
}
```
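To make the priority ordering concrete, here is a minimal sketch (the `LoggingMiddlewareA`/`LoggingMiddlewareB` names are made up for illustration): `process_request` runs from low to high priority number on the way to the downloader, and `process_response` runs back in the opposite order.

```python
class LoggingMiddlewareA(object):
    def process_request(self, request, spider):
        spider.logger.info("A.process_request")   # runs first (priority 100)

    def process_response(self, request, response, spider):
        spider.logger.info("A.process_response")  # runs second on the way back
        return response


class LoggingMiddlewareB(object):
    def process_request(self, request, spider):
        spider.logger.info("B.process_request")   # runs second (priority 200)

    def process_response(self, request, response, spider):
        spider.logger.info("B.process_response")  # runs first on the way back
        return response


# settings.py
DOWNLOADER_MIDDLEWARES = {
    'MogujiePro.middlewares.LoggingMiddlewareA': 100,
    'MogujiePro.middlewares.LoggingMiddlewareB': 200,
}
```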
- The middlewares.py file
```python
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


# Spider middleware. On its own this is an ordinary class; it becomes a
# middleware component once it is registered in the middleware settings.
class MogujieproSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


# Downloader middleware. Likewise an ordinary class until it is registered
# as a middleware component.
class MogujieproDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):  # class method -- called when a crawler is created
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        print("A crawler has been created!")
        return s

    def process_request(self, request, spider):  # called whenever a request passes through this middleware
        print("process_request: a request is passing through this middleware")
        # Called for each request that goes through the downloader middleware.
        print("Request object:", request)
        print("Spider object:", spider)
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        print("process_response: a response is passing through this middleware")
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        print("process_exception: called when an exception is raised")
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        print("spider_opened: called when the spider is opened")
        spider.logger.info('Spider opened: %s' % spider.name)
```
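The return-value contract above is what makes these methods composable. As a hedged sketch of how it is used in practice (the `RetryOn503Middleware` name and logic are illustrative, not part of this project): returning a `Response` from `process_response` passes it on toward the spider, while returning a `Request` re-schedules the URL instead.

```python
class RetryOn503Middleware(object):
    def process_response(self, request, response, spider):
        if response.status == 503:
            # Returning a Request stops this response here and re-queues the URL.
            return request.replace(dont_filter=True)
        # Returning the Response lets it continue through the chain.
        return response
```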
In the middlewares.py file, define a custom middleware that plugs in the Selenium + webdriver mechanism: pages whose data cannot be fetched directly are rendered by Selenium and handed back as the response.
```python
from scrapy import signals

from selenium import webdriver
from time import sleep
from scrapy.http import HtmlResponse


# Custom middleware that plugs in the Selenium + webdriver mechanism
class SeleniumDownloaderMiddleware(object):

    # Override the request-processing method
    def process_request(self, request, spider):
        print("Selenium middleware active!")

        # Headless mode: the browser window is never brought up
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        opt.add_argument("--disable-gpu")
        driver = webdriver.Chrome(options=opt)
        print("Browser is visiting:", request.url)
        driver.get(url=request.url)
        sleep(1)

        # To load more content, scroll down step by step so lazily
        # loaded elements are rendered
        for i in range(1, 100):
            js = "document.documentElement.scrollTop=%d" % (i * 100)
            driver.execute_script(js)
            sleep(1)

        # Grab the rendered page source from the driver
        body = driver.page_source
        url = driver.current_url
        driver.quit()  # release the browser
        # Wrap the page source in a Response object and return it to Scrapy
        return HtmlResponse(url=url, body=body, encoding="utf-8", request=request)
```
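One thing to note about this sketch: it launches a fresh Chrome instance for every request. A possible refinement (an assumption on my part, not in the original code) is to open one browser per spider via signals and close it when the spider finishes; `process_request` would then use `self.driver` instead of creating its own.

```python
from scrapy import signals
from selenium import webdriver


class SharedSeleniumMiddleware(object):  # hypothetical variant

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_opened(self, spider):
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)  # one browser for the whole run

    def spider_closed(self, spider):
        self.driver.quit()  # released once, when the spider closes
```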
- The items.py file
```python
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


# This file models the data to be scraped and maps directly onto the
# requirements analysis. With these fields defined, the spider no longer
# needs to build plain dicts by hand.
class MogujieproItem(scrapy.Item):
    title = scrapy.Field()      # product title
    org_price = scrapy.Field()  # original price
    price = scrapy.Field()      # current price
    pic = scrapy.Field()        # image URL
```
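Since `MogujieproItem` behaves like a dict with a fixed key set, a quick check (illustrative only, values invented) shows why it replaces hand-built dicts: a mistyped field name fails immediately instead of silently producing bad data.

```python
from MogujiePro.items import MogujieproItem

item = MogujieproItem()
item["title"] = "some skirt"
item["price"] = "59.00"
print(dict(item))        # {'title': 'some skirt', 'price': '59.00'}
# item["titel"] = "oops" # raises KeyError: unknown field
```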
- The mogu.py file
```python
# -*- coding: utf-8 -*-
import scrapy
from MogujiePro.items import MogujieproItem


class MoguSpider(scrapy.Spider):
    name = 'mogu'
    allowed_domains = ['mogujie.com']
    start_urls = ['https://list.mogu.com/book/skirt/50004?acm=3.mce.1_10_1lrec.128038.0.t8Z0frzoVgpJo.pos_1-m_507614-sd_119&ptp=31.v5mL0b.0.0.wyZwwn6Y']

    def parse(self, response):
        # Use Scrapy's CSS selectors to parse the page rendered by Selenium
        goods_list = response.css(".iwf")  # CSS syntax is essentially the same as bs4's
        for goods in goods_list:
            # Create a data model instance
            item = MogujieproItem()
            # item behaves like a dict
            item["title"] = goods.css(".title::text").extract_first()  # extract text content
            item["org_price"] = goods.css(".org_price > span::text").extract_first()
            item["price"] = goods.css(".price_info::text").extract_first()
            item["pic"] = goods.css("[rel='nofollow']::attr(img-src)").extract_first()  # extract an attribute value
            yield item
```
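The selectors can be sanity-checked offline without launching a crawl. The sample HTML below is made up to mirror the structure the spider expects (class names taken from the code above, values invented):

```python
from scrapy.http import HtmlResponse

html = b"""
<div class="iwf">
  <a rel="nofollow" img-src="http://img.example/1.jpg"></a>
  <p class="title">some skirt</p>
  <p class="org_price"><span>99.00</span></p>
  <p class="price_info">59.00</p>
</div>
"""
response = HtmlResponse(url="http://example.com", body=html, encoding="utf-8")
goods = response.css(".iwf")[0]
print(goods.css(".title::text").extract_first())                     # some skirt
print(goods.css("[rel='nofollow']::attr(img-src)").extract_first())  # http://img.example/1.jpg
```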
- Write the storage code in the pipelines.py file
```python
import redis
import json


class MogujieproPipeline(object):

    def open_spider(self, spider):
        # Connect to Redis (substitute your own host/port/db)
        self.rds = redis.StrictRedis(host="www.fnajainbo", port=6379, db=2)

    def process_item(self, item, spider):
        # Serialize the item to JSON and push it onto a Redis list
        self.rds.lpush("mogujie", json.dumps(dict(item)))
        return item

    def close_spider(self, spider):
        pass
```
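For the pipeline to actually run it must also be registered in settings.py (standard Scrapy configuration, implied but not shown above):

```python
# settings.py
ITEM_PIPELINES = {
    'MogujiePro.pipelines.MogujieproPipeline': 300,
}
```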