# Commonly used Scrapy pipelines for data collection — backed up here for reuse.
import json
import pymysql
from scrapy import Request
from twisted.enterprise import adbapi
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class YwnamePipeline:
    """Default no-op pipeline: passes every item through unchanged."""

    def process_item(self, item, spider):
        # Scrapy contract: return the item so downstream pipelines receive it.
        return item
class myEncoder(json.JSONEncoder):
    """JSON encoder that serializes ``bytes`` values as UTF-8 strings.

    The scraped paste used smart quotes (``‘utf-8’``), which is a syntax
    error; restored to plain ASCII quotes. Class name kept lowercase for
    backward compatibility with existing ``cls=myEncoder`` call sites.
    """

    def default(self, obj):
        if isinstance(obj, bytes):
            # bytes are not JSON-serializable by default; decode as UTF-8.
            return str(obj, encoding="utf-8")
        # Fall back to the base class, which raises TypeError for unknowns.
        return json.JSONEncoder.default(self, obj)
class FilePipeline(object):
    """Write scraped items to ``ywname.json`` as a single JSON array.

    Fixes over the pasted original:
    - ``def init`` was never called by Python; renamed to ``__init__``.
    - Smart quotes restored to ASCII quotes.
    - The original appended ``",\\n"`` after every item, leaving a trailing
      comma before the closing ``]`` — invalid JSON. A first-item flag now
      writes the separator only between items.
    """

    def __init__(self):
        self.f = open("ywname.json", "w", encoding="utf-8")
        self.f.write("[")
        # True until the first item is written; controls comma placement.
        self._first = True

    def process_item(self, item, spider):
        # Convert the item to a plain dict, then to JSON; myEncoder handles
        # any bytes values. ensure_ascii=False keeps non-ASCII text readable.
        text = json.dumps(dict(item), ensure_ascii=False, cls=myEncoder)
        if not self._first:
            self.f.write(",\n")
        self._first = False
        self.f.write(text)
        return item

    def close_spider(self, spider):
        # Close the JSON array and release the file handle.
        self.f.write("]")
        self.f.close()
class DownimagesPipeline(ImagesPipeline):
    """Image pipeline: download ``item['url']`` and store it under its basename."""

    def file_path(self, request, response=None, info=None):
        # NOTE(review): the scraped original read
        #   file_name = "https://haotingde.com/"url.split('/')[-1]
        # which is a syntax error — the URL prefix appears to be a watermark
        # injected by the hosting site, and a full URL is not a valid local
        # file name. Only the last path segment of the URL is kept here.
        url = request.url
        return url.split("/")[-1]

    def item_completed(self, results, item, info):
        # results is a list of (success, info_dict) pairs; keep paths of
        # successful downloads and drop the item if none succeeded.
        image_paths = [x["path"] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Image Downloaded Failed")
        return item

    def get_media_requests(self, item, info):
        # Schedule a single download request for the item's image URL.
        yield Request(item["url"])
# Source: https://912616.com/app/python/307.html
# Summary (translated): this file demonstrates several Scrapy pipelines —
# YwnamePipeline (pass-through), FilePipeline (serialize scraped items to a
# JSON file), and DownimagesPipeline (download images and drop items whose
# download failed).
