Overview
After an Item is collected in a Spider, it is passed into the Item pipeline for processing. The classes in the Item pipeline receive the Item one by one and perform operations on it (cleaning, dropping, validation, deduplication, writing to a database, and so on).
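As a minimal sketch of what such a class looks like (the PricePipeline name and the price field are hypothetical, not part of this project), a pipeline implements process_item and either returns the item to pass it on or raises DropItem to discard it:

from scrapy.exceptions import DropItem

class PricePipeline(object):
    """Illustrative pipeline: drop any item that has no price field."""
    def process_item(self, item, spider):
        # Called once for every item the spider yields
        if item.get('price'):
            return item  # hand the item to the next pipeline
        raise DropItem('Missing price in %s' % item)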
Configure the MongoDB connection and enable the pipeline in the project's settings.py file:
# MongoDB connection settings
MONGO_DB_URI = '127.0.0.1'
MONGO_DB_PORT = 27017
MONGO_DB_NAME = 'doubanDetail'

# Configure item pipelines
ITEM_PIPELINES = {
    'douban.pipelines.MongoDBPipleline': 300,
}
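The integer assigned to each pipeline (300 above) controls the order in which pipelines run: values are conventionally chosen in the 0-1000 range, and items pass through the pipelines from the lowest value to the highest. A sketch assuming a second, hypothetical DuplicatesPipeline existed in the same project:

ITEM_PIPELINES = {
    'douban.pipelines.DuplicatesPipeline': 200,  # hypothetical pipeline, runs first (lower value)
    'douban.pipelines.MongoDBPipleline': 300,    # runs second, writes the item to MongoDB
}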
Create the MongoDBPipleline class (the name registered in ITEM_PIPELINES above) in pipelines.py:
from douban.items import DoubanItem
import pymongo


class DoubanPipeline(object):
    def process_item(self, item, spider):
        return item


class MongoDBPipleline(object):
    """Write items into MongoDB."""

    def __init__(self):
        """Open the MongoDB connection, get the doubanDetail database and the newBookDetail collection."""
        client = pymongo.MongoClient('localhost', 27017)
        db = client.doubanDetail
        self.newBookDetail = db.newBookDetail

    @classmethod
    def from_crawler(cls, crawler):
        # from_crawler receives the running Crawler, which exposes the values defined in settings.py
        cls.DB_URL = crawler.settings.get('MONGO_DB_URI')