scrapy下载图片
使用scrapy下载漂亮的小姐姐图片
直接上代码:
items.py文件中
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class MzituprojectItem(scrapy.Item):
    """Container for one scraped image entry from mzitu.com."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Image title (taken from the <img> alt attribute in the spider)
    img_name = scrapy.Field()
    # Image URL (taken from the <img> data-original attribute in the spider)
    img_url = scrapy.Field()
    # Publish date text of the post (second <span> in the listing card)
    publish_time = scrapy.Field()
mzitu.py文件中:
import scrapy
from mzituproject.items import MzituprojectItem
import os
class MzituSpider(scrapy.Spider):
    """Walk the mzitu.com listing pages and yield one item per image card."""

    name = 'mzitu'
    allowed_domains = ['www.mzitu.com']
    start_urls = ['https://www.mzitu.com/']
    # Template for paginated listing URLs; filled in with the page number.
    url = 'https://www.mzitu.com/page/{}/'
    # Current listing page; incremented before each follow-up request.
    page = 1

    def parse(self, response):
        """Extract every image card on this listing page, then follow the next page."""
        # Each card is an <li> under the <ul id="pins"> container.
        for card in response.xpath('//ul[@id="pins"]/li'):
            item = MzituprojectItem()
            item['img_name'] = card.xpath('.//a/img/@alt').extract_first()
            # The real image URL lives in data-original (lazy-loading), not src.
            item['img_url'] = card.xpath('.//a/img/@data-original').extract_first()
            item['publish_time'] = card.xpath('.//span[2]/text()').extract_first()
            yield item
        # Keep paginating until page 254 has been requested (site has ~253 pages).
        if self.page <= 253:
            self.page += 1
            next_page_url = self.url.format(self.page)
            yield scrapy.Request(url=next_page_url, callback=self.parse)
pipelines.py文件中:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import json
import os
# import urllib.request
import requests
from itemadapter import ItemAdapter
class MzituprojectPipeline:
    """Pipeline that downloads each scraped image and logs item metadata to mzitu.txt."""

    # A request without a timeout can block the whole crawl forever on a
    # stalled server; bound each download instead.
    DOWNLOAD_TIMEOUT = 10

    def open_spider(self, spider):
        # One JSON line per item; file is opened once per crawl and closed
        # in close_spider.
        self.fp = open('mzitu.txt', 'w', encoding='utf8')

    def process_item(self, item, spider):
        """Download the item's image, then append its metadata as a JSON line."""
        self.download(item)
        obj = dict(item)
        self.fp.write(json.dumps(obj, ensure_ascii=False) + '\n')
        return item

    def download(self, item):
        """Fetch item['img_url'] and save it under the local 'mzitu' directory."""
        dirpath = 'mzitu'
        # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
        os.makedirs(dirpath, exist_ok=True)
        # Name the local file after the last path component of the URL.
        filename = os.path.basename(item['img_url'])
        filepath = os.path.join(dirpath, filename)
        # requests is used because urllib.request.urlretrieve() fails with
        # HTTP 403 here — the site rejects requests without a browser User-Agent.
        resp = requests.get(
            url=item['img_url'],
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.64'
            },
            timeout=self.DOWNLOAD_TIMEOUT,  # fail fast instead of hanging the crawl
        )
        # Only persist successful responses; otherwise an HTML error page
        # would be written to disk under an image filename.
        if resp.ok:
            with open(filepath, 'wb') as fp:
                fp.write(resp.content)

    def close_spider(self, spider):
        self.fp.close()
配置文件以前在scrapy框架这篇博客中已经配置过,这里就不再赘述。如果不了解scrapy框架,建议先学一下scrapy框架的基础知识。
本文展示了如何利用Scrapy框架从网站www.mzitu.com抓取并下载图片。首先定义了Scrapy项目中的items,包括图片名字、URL和发布时间字段。接着在Spider中解析响应,获取每个图片的详细信息,并通过Request发送请求到下一页。最后,在pipelines中处理图片下载,将图片保存到本地,并将相关信息写入txt文件。
1493

被折叠的 条评论
为什么被折叠?



