Framework Workflow
In this case study we use Scrapy to crawl the images of one section of the 图片之家 site (tpzj.com); the workflow of the whole framework is shown in the figure above. The orange lines represent Scrapy requesting the server that hosts the initial URL, the green lines represent Scrapy requesting the servers that host the child-page URLs, and the black lines represent the flow in which Scrapy downloads the images.
The corresponding steps are as follows:
(1-1) The engine pulls the Request object for the initial URL from the spider and puts it into the scheduler;
(1-2) At some point the engine takes that Request object out of the scheduler and sends it to the downloader;
(1-3) The downloader performs the request, wraps the response it receives into a Response object, and sends that Response object back to the engine;
(1-4) The engine sends the Response object to the spider, whose parse method parses it;
(2-1) Since child-page URLs were parsed out, the spider wraps each of them into a Request object and submits it to the engine; the flow then repeats (1-2) and (1-3);
(2-4) The engine sends the Response object of a child-page URL to the spider. Because the HTML of the initial URL and of the child pages differ, we write a new Response-parsing method, parse_detail, which extracts the image download addresses.
(3-1) Since downloading the images is a form of data persistence, the engine passes the image download links to the pipeline via an item, and the download itself is implemented in the pipeline. (An item is one of Scrapy's formats for passing data around; we can think of it as a dictionary.) A minimal skeleton of this flow is sketched right after this list.
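The skeleton below is a sketch only: the class and method names anticipate the real code further down, while the child-page path and the item values are placeholders. Each yield corresponds to one of the numbered steps:

import scrapy

class ImagedownloadSpider(scrapy.Spider):
    name = "imageDownload"
    start_urls = ["https://www.tpzj.com/"]  # (1-1): the engine builds the initial Request from this list

    def parse(self, response, **kwargs):
        # (1-4): the engine delivers the initial Response to parse
        child_url = response.urljoin("mingxing/xiezhen/")  # illustrative href value
        yield scrapy.Request(url=child_url, callback=self.parse_detail)  # (2-1): back through engine and scheduler

    def parse_detail(self, response, **kwargs):
        # (2-4): child pages have different HTML, so they get their own parsing method
        yield {"src": "https://example.com/a.jpg", "name": "demo"}  # (3-1): placeholder item headed for the pipeline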
Create runner.py to run Scrapy
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute("scrapy crawl imageDownload".split())
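Run the file with: python runner.py. An equivalent way to start the crawl from a script, if you prefer not to go through scrapy.cmdline, is Scrapy's CrawlerProcess; a minimal sketch, assuming the script sits in the project root next to scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads the project's settings.py
    process.crawl("imageDownload")  # the spider can be referenced by its name
    process.start()  # blocks until the crawl finishes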
Main Scrapy code
(1) spider
import scrapy
from picture.items import PictureItem
'''
(1) Create the runner file so we can run the imageDownload spider conveniently
(2) Modify settings.py:
    log level: LOG_LEVEL = "WARNING"
    robots.txt compliance: ROBOTSTXT_OBEY = False
    delay between requests: DOWNLOAD_DELAY = 3
(3) Modify start_urls; the value auto-generated by the Scrapy framework is very likely not the start URL we need
Goal: fetch the male-celebrity albums in the entertainment/gossip section of 图片之家 ('https://www.tpzj.com/')
'''
class ImagedownloadSpider(scrapy.Spider):
    name = "imageDownload"
    allowed_domains = ["tpzj.com"]
    start_urls = ["https://www.tpzj.com/"]

    def parse(self, response, **kwargs):  # parse is the default method for handling the first response
        ls = response.xpath('//div[@class="warpbox bgff"][1]//div[@class="list_menu"]/a[6]')
        for l in ls:
            href = l.xpath('./@href').extract_first()
            name = l.xpath('./text()').extract_first()
            # Join the child URL onto the base URL:
            # https://www.tpzj.com/ + href -> https://www.tpzj.com/mingxing/xiezhen/
            child_url = response.urljoin(href)  # built-in Scrapy method that joins URL fragments
            yield scrapy.Request(
                url=child_url,
                method="GET",
                callback=self.parse_detail  # callback: the method that will parse this page
            )

    def parse_detail(self, response, **kwargs):
        lis = response.xpath('//div[@id="tage9b611f8ab43453e519443ccb9a97ac0"]/li')
        for li in lis:
            src_url = li.xpath('./a/img/@src').extract_first()
            name = li.xpath('./a/@title').extract_first()
            # print(src_url)
            # print(name)
            item = PictureItem()  # create the item object
            item['src'] = src_url
            item['name'] = name
            yield item  # hand the data to the pipeline
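A quick note on the urljoin call above: Response.urljoin(url) is a shortcut for urllib.parse.urljoin(response.url, url), so both absolute and relative href values resolve correctly. A small sanity check (the href values here are illustrative):

from urllib.parse import urljoin

print(urljoin("https://www.tpzj.com/", "/mingxing/xiezhen/"))  # absolute path
print(urljoin("https://www.tpzj.com/", "mingxing/xiezhen/"))   # relative path
# both print: https://www.tpzj.com/mingxing/xiezhen/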
(2) item
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class PictureItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    src = scrapy.Field()
    name = scrapy.Field()
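As mentioned earlier, an item behaves much like a dictionary; a quick illustration (the values are made up):

item = PictureItem()
item['src'] = "https://example.com/a.jpg"  # hypothetical value
item['name'] = "demo"
print(item['src'])  # access works like a dict
print(dict(item))   # {'src': 'https://example.com/a.jpg', 'name': 'demo'}
# Assigning a field that was not declared raises KeyError, which catches typos early:
# item['nmae'] = "oops"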
(3) pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline  # ImagesPipeline is Scrapy's built-in class dedicated to saving images
from scrapy.pipelines.files import FilesPipeline  # FilesPipeline is Scrapy's class for saving files; both classes are used the same way
import requests

class PicturePipeline:
    def process_item(self, item, spider):
        return item

# Enable the pipeline in settings.py.
# Create a new class that inherits from ImagesPipeline.
'''
The settings in settings.py are as follows:
ITEM_PIPELINES = {
    "picture.pipelines.PicturePipeline": 300,
    "picture.pipelines.PicturePipeline1": 299,  # data enters this pipeline first (lower value = higher priority)
}
IMAGES_STORE = "./images"  # where downloaded images are stored
'''
class PicturePipeline1(ImagesPipeline):
    # def get_media_requests(self, item, info):
    #     yield scrapy.Request(item['src'], meta={'name': item['name']})  # the default way Scrapy's media pipelines issue requests, used only for fetching images and files
    # # If the image/file mechanism above feels hard to follow, we can fetch the images ourselves with requests instead
    # def file_path(self, request, response=None, info=None, *, item=None):
    #     title = request.meta['name'] + '.jpg'  # the meta key set in get_media_requests is 'name'
    #     return title
    # def item_completed(self, results, item, info):
    #     print(results)
    def process_item(self, item, spider):
        # note: overriding process_item like this bypasses ImagesPipeline's own download machinery
        response = requests.get(url=item['src'])
        print(response.content)  # demo only: prints the raw image bytes instead of saving them
        return item
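To actually persist the bytes that process_item above only prints, a small extension is enough. A minimal sketch: PicturePipeline2 is a hypothetical class name, the ./images folder mirrors the IMAGES_STORE convention, and every image is assumed to be a JPEG:

import os
import requests

class PicturePipeline2:  # hypothetical; not part of the project as written
    def process_item(self, item, spider):
        os.makedirs("./images", exist_ok=True)  # make sure the target folder exists
        response = requests.get(url=item['src'])
        with open(os.path.join("./images", item['name'] + '.jpg'), 'wb') as f:
            f.write(response.content)  # write the raw image bytes to disk
        return item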
(4) settings
# Scrapy settings for picture project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "picture"
SPIDER_MODULES = ["picture.spiders"]
NEWSPIDER_MODULE = "picture.spiders"
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "picture (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "picture.middlewares.PictureSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# "picture.middlewares.PictureDownloaderMiddleware": 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "picture.pipelines.PicturePipeline": 300,
    "picture.pipelines.PicturePipeline1": 299,  # data enters this pipeline first (lower value = higher priority)
}
# IMAGES_STORE = "./images"  # where downloaded images are stored; uncomment when using ImagesPipeline
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
LOG_LEVEL = "WARNING"