Python Scrapy 修改默认图片管道，更改缩略图文件名

最新推荐文章于 2023-06-27 10:43:33 发布

原创最新推荐文章于 2023-06-27 10:43:33 发布 · 560 阅读

4 ·

CC 4.0 BY-SA版权

文章标签：

#json #缩略图命名 #Scrapy #图片管道 #哈希值

Python 同时被 3 个专栏收录

17 篇文章

订阅专栏

爬虫程序

10 篇文章

订阅专栏

数据存储

3 篇文章

订阅专栏

本文介绍如何在Scrapy框架中自定义图片管道，通过修改缩略图文件名称，使用小说名作为图片命名，实现更有序的图片管理。

部署运行你感兴趣的模型镜像

在学习使用Scrapy图片管道的时候，书上没有给出修改下载的缩略图文件名称的方法。自己研究了下，搞定了，记下来。

要爬的是起点中文网：

items.py 中要爬的有这么几项：

import scrapy


class DownloadimageItem(scrapy.Item):
    """Scrapy item describing one novel together with its image fields."""

    # Novel title
    title = scrapy.Field()
    # Novel author
    author = scrapy.Field()
    # Novel genre / category
    type = scrapy.Field()
    # Image URLs to download
    image_urls = scrapy.Field()
    # Image download result information
    images = scrapy.Field()

根据网页结构，写出爬虫程序：

# -*- coding: utf-8 -*-
import scrapy
from downloadimage.items import DownloadimageItem

class GetimageSpider(scrapy.Spider):
    """Spider that scrapes novel entries from the listing page in ``start_urls``.

    For every ``li[data-rid]`` element it yields a ``DownloadimageItem`` with
    the title, author, genre and image URL of the novel.
    """

    name = 'getimage'
    allowed_domains = ['qidian.com']
    start_urls = ['https://www.qidian.com/finish']

    def parse(self, response):
        """Yield one ``DownloadimageItem`` per novel entry found in ``response``.

        An entry without an image ``src`` is still yielded, with an empty
        ``image_urls`` list, rather than raising ``TypeError`` and aborting
        the remaining entries on the page.
        """
        for novel in response.css('li[data-rid]'):
            item = DownloadimageItem()
            item['title'] = novel.css('li h4>a::text').extract_first()
            item['author'] = novel.css('li p.author>a.name::text').extract_first()
            item['type'] = novel.css('em + a::text').extract_first()
            src = novel.css('a[data-bid][data-eid]>img::attr(src)').extract_first()
            src = (src or '').strip()
            # extract_first() returns None when nothing matches; never
            # concatenate or join a missing value.
            item['image_urls'] = [self._absolute_image_url(response, src)] if src else []
            yield item

    @staticmethod
    def _absolute_image_url(response, src):
        """Turn a non-empty image ``src`` attribute into an absolute URL."""
        if src.startswith('//'):
            # Protocol-relative URL: keep the original behavior of using http.
            return 'http:' + src
        # Already-absolute URLs are returned unchanged; relative ones are
        # resolved against the page URL.
        return response.urljoin(src)

爬取的结果如下：

下图是下载下来的缩略图文件：

设置文件中我是这么写的，用 DownloadimagePipeline 这个自定义的管道来实现 json 文件的打开、关闭、读写等操作。
用 NewImageNamePipeline 继承默认的 ImagesPipeline 管道，通过重写 ImagesPipeline 的几个方法来实现更改下载的图片名称。

# Enabled item pipelines and their order numbers.
ITEM_PIPELINES = {
    'downloadimage.pipelines.DownloadimagePipeline': 300,
    'downloadimage.pipelines.NewImageNamePipeline': 1,
}

# Directory under which downloaded images (and thumbnails) are stored.
IMAGES_STORE = "D:/Output/python/ScrapyProjects/downloadimage"

# Item fields holding the image URLs and the download results.
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'

# Thumbnail name -> (width, height).
IMAGES_THUMBS = dict(
    small=(80, 80),
    big=(300, 300),
)

最关键的是这两个管道的代码：

import json
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class DownloadimagePipeline(object):
    """Item pipeline that writes every item to ``qidian.json``, one JSON object per line."""

    def open_spider(self, spider):
        """Open ``qidian.json`` for writing (UTF-8) when the spider starts."""
        self.file = open('qidian.json', mode='w', encoding='utf-8')

    def close_spider(self, spider):
        """Close ``qidian.json`` when the spider finishes."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and return it unchanged."""
        record = dict(item)
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        serialized = json.dumps(record, ensure_ascii=False)
        self.file.write(serialized + "\n")
        return item

class NewImageNamePipeline(ImagesPipeline):
    """Images pipeline that names images and thumbnails after the novel title.

    Full-size images are stored as ``full/<title>.jpg`` and thumbnails as
    ``thumbs/<thumb_id>/<title>.jpg``. The title is made filesystem-safe
    first, and the SHA1 of the image URL is used when no usable title exists.

    NOTE: all images of one item share the same name, so an item carrying
    several image URLs would overwrite its own files.
    """

    # Path separators and characters that are invalid in Windows file names;
    # each one is replaced by "_" so a title can never change the target
    # directory or produce an unwritable path.
    _UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def _image_name(self, request, item=None):
        """Return the base file name (without extension) for ``request``.

        ``item`` is the keyword argument newer Scrapy versions pass to
        ``file_path``/``thumb_path``; when it is absent the item attached to
        ``request.meta`` by ``get_media_requests`` is used instead.
        """
        if item is None:
            item = request.meta.get('item')
        title = item.get('title') if item is not None else None
        name = str(title).translate(self._UNSAFE_CHARS).strip(' .') if title else ''
        if not name:
            # No usable title: fall back to a URL hash so such images do not
            # all overwrite the same "None.jpg" file.
            import hashlib
            name = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
        return name

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path of the full-size image, named after the novel title."""
        return 'full/%s.jpg' % self._image_name(request, item)

    def get_media_requests(self, item, info):
        """Yield one download request per image URL, carrying the item in ``meta``."""
        for image_url in item['image_urls']:
            # The meta argument passes the item along to file_path/thumb_path.
            yield Request(image_url, meta={'item': item})

    def thumb_path(self, request, thumb_id, response=None, info=None, *, item=None):
        """Return the storage path of a thumbnail, named after the novel title."""
        return 'thumbs/%s/%s.jpg' % (thumb_id, self._image_name(request, item))

我把代码精简到极致了，不再赘述，看不懂的也可以直接拿来用，有问题可以留言给我。如果你想问我是怎么知道重写 thumb_path 这个方法就能改缩略图的名字的：我找到了默认的 ImagesPipeline 管道的源代码，在 thumb_path 方法里看到 thumb_guid = hashlib.sha1(url).hexdigest() 这一句，看得出来这是在对图片 URL 取 SHA1 值作为缩略图的文件名，所以需要重写的就是 thumb_path 这个方法（原图的文件名同理，对应的是 file_path 方法）。下面是 ImagesPipeline 管道的源代码（较旧的 Python 2 版本），你也可以来试试，看能不能找出来。

class ImagesPipeline(FilesPipeline):
    """Abstract pipeline that implements the image thumbnail generation logic.

    Every downloaded image is converted to an RGB JPEG and stored under
    ``full/``; for each entry in ``THUMBS`` an additional thumbnail is stored
    under ``thumbs/<thumb_id>/``. File names default to the SHA1 hex digest of
    the image URL (see ``file_path`` and ``thumb_path``).

    NOTE(review): ``iteritems()`` and ``StringIO(response.body)`` below are
    Python 2 idioms; this looks like a legacy snapshot of the library source.
    """

    MEDIA_NAME = 'image'
    # Minimum accepted image size; smaller images raise ImageException in get_images.
    MIN_WIDTH = 0
    MIN_HEIGHT = 0
    # Mapping of thumb_id -> size, iterated in get_images.
    THUMBS = {}
    # Fallback item field names used when the settings do not override them.
    DEFAULT_IMAGES_URLS_FIELD = 'image_urls'
    DEFAULT_IMAGES_RESULT_FIELD = 'images'

    @classmethod
    def from_settings(cls, settings):
        """Configure the class from ``settings`` and return a pipeline instance.

        The values are assigned onto ``cls`` itself (not onto an instance);
        the pipeline is then constructed from ``settings['IMAGES_STORE']``.
        """
        cls.MIN_WIDTH = settings.getint('IMAGES_MIN_WIDTH', 0)
        cls.MIN_HEIGHT = settings.getint('IMAGES_MIN_HEIGHT', 0)
        cls.EXPIRES = settings.getint('IMAGES_EXPIRES', 90)
        cls.THUMBS = settings.get('IMAGES_THUMBS', {})
        # Copy the AWS credentials from the settings onto the 's3' store scheme
        # (presumably a store class defined by FilesPipeline -- not shown here).
        s3store = cls.STORE_SCHEMES['s3']
        s3store.AWS_ACCESS_KEY_ID = settings['AWS_ACCESS_KEY_ID']
        s3store.AWS_SECRET_ACCESS_KEY = settings['AWS_SECRET_ACCESS_KEY']

        cls.IMAGES_URLS_FIELD = settings.get('IMAGES_URLS_FIELD', cls.DEFAULT_IMAGES_URLS_FIELD)
        cls.IMAGES_RESULT_FIELD = settings.get('IMAGES_RESULT_FIELD', cls.DEFAULT_IMAGES_RESULT_FIELD)
        store_uri = settings['IMAGES_STORE']
        return cls(store_uri)

    def file_downloaded(self, response, request, info):
        """Delegate to ``image_downloaded`` and return its result."""
        return self.image_downloaded(response, request, info)

    def image_downloaded(self, response, request, info):
        """Persist every image produced by ``get_images`` and return a checksum.

        The checksum is computed with ``md5sum`` from the first buffer yielded,
        which is the full-size image; it stays ``None`` if nothing is yielded.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                # Rewind first: convert_image leaves the buffer positioned
                # after the data it just saved.
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

    def get_images(self, response, request, info):
        """Yield ``(path, image, buffer)`` for the full image, then each thumbnail.

        Raises ``ImageException`` when the downloaded image is narrower than
        ``MIN_WIDTH`` or shorter than ``MIN_HEIGHT``.
        """
        path = self.file_path(request, response=response, info=info)
        orig_image = Image.open(StringIO(response.body))

        width, height = orig_image.size
        if width < self.MIN_WIDTH or height < self.MIN_HEIGHT:
            raise ImageException("Image too small (%dx%d < %dx%d)" %
                                 (width, height, self.MIN_WIDTH, self.MIN_HEIGHT))

        image, buf = self.convert_image(orig_image)
        yield path, image, buf

        # Thumbnails are derived from the already-converted image, not from
        # the original download; their paths come from thumb_path().
        for thumb_id, size in self.THUMBS.iteritems():
            thumb_path = self.thumb_path(request, thumb_id, response=response, info=info)
            thumb_image, thumb_buf = self.convert_image(image, size)
            yield thumb_path, thumb_image, thumb_buf

    def convert_image(self, image, size=None):
        """Return ``(image, buf)``: the image as RGB and its JPEG data in a new buffer.

        When ``size`` is given, a copy of the image is shrunk with
        ``thumbnail(size, Image.ANTIALIAS)`` before it is saved.
        """
        if image.format == 'PNG' and image.mode == 'RGBA':
            # Paste the RGBA PNG onto a white canvas before converting to RGB
            # (the second argument is presumably used as the paste mask).
            background = Image.new('RGBA', image.size, (255, 255, 255))
            background.paste(image, image)
            image = background.convert('RGB')
        elif image.mode != 'RGB':
            image = image.convert('RGB')

        if size:
            # Work on a copy so the caller's image is not resized.
            image = image.copy()
            image.thumbnail(size, Image.ANTIALIAS)

        buf = StringIO()
        image.save(buf, 'JPEG')
        return image, buf

    def get_media_requests(self, item, info):
        """Return one ``Request`` per URL in the item's image-URLs field (none if absent)."""
        return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]

    def item_completed(self, results, item, info):
        """Store the successful results in the item's result field and return the item.

        The field is only written when the item declares it in ``item.fields``.
        """
        if self.IMAGES_RESULT_FIELD in item.fields:
            item[self.IMAGES_RESULT_FIELD] = [x for ok, x in results if ok]
        return item

    def file_path(self, request, response=None, info=None):
        """Return the storage path of the full-size image.

        By default this is ``full/<sha1 hex digest of the URL>.jpg``. If a
        subclass overrides the deprecated ``file_key`` or ``image_key``, that
        override is called instead and a deprecation warning is issued.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        # (the base implementations below are tagged with `_base = True`;
        # an override in a subclass does not carry that attribute)
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
        return 'full/%s.jpg' % (image_guid)

    def thumb_path(self, request, thumb_id, response=None, info=None):
        """Return the storage path of the thumbnail identified by ``thumb_id``.

        By default this is ``thumbs/<thumb_id>/<sha1 hex digest of the URL>.jpg``.
        If a subclass overrides the deprecated ``thumb_key``, that override is
        called instead and a deprecation warning is issued.
        """
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.thumb_key(url) method is deprecated, please use '
                          'thumb_path(request, thumb_id, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from thumb_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if thumb_key() method has been overridden
        # (same `_base` tagging as used by file_path)
        if not hasattr(self.thumb_key, '_base'):
            _warn()
            return self.thumb_key(url, thumb_id)
        ## end of deprecation warning block

        thumb_guid = hashlib.sha1(url).hexdigest()  # change to request.url after deprecation
        return 'thumbs/%s/%s.jpg' % (thumb_id, thumb_guid)

    # deprecated: forwards to image_key; `_base` marks it as the non-overridden default
    def file_key(self, url):
        return self.image_key(url)
    file_key._base = True

    # deprecated: forwards to file_path with a bare URL; `_base` marks the default
    def image_key(self, url):
        return self.file_path(url)
    image_key._base = True

    # deprecated: forwards to thumb_path with a bare URL; `_base` marks the default
    def thumb_key(self, url, thumb_id):
        return self.thumb_path(url, thumb_id)
    thumb_key._base = True

您可能感兴趣的与本文相关的镜像