The workflow is as follows:
- In the spider file, i.e. the .py file that `scrapy genspider <spider-name> <domain>` creates:
```python
# -*- coding: utf-8 -*-
import scrapy
from ..items import LnzbItem


class LnzxzbSpider(scrapy.Spider):
    name = 'lnzxzb'
    # allowed_domains = ['baidu.com']
    # Build the list-page URLs for pages 2 through 100
    start_urls = ['http://www.lnzxzb.cn/gcjyxx/004001/%s.html' % page
                  for page in range(2, 101)]

    def parse(self, response):
        # Each announcement is an <li> under <ul id="showList">
        li_list = response.xpath('//ul[@id="showList"]/li')
        for li in li_list:
            title = li.xpath('./p/a/@title').extract_first()
            date = li.xpath('./span/text()').extract_first()
            item = LnzbItem()
            item['title'] = title
            item['date'] = date
            yield item
```
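To launch the crawl, run `scrapy crawl lnzxzb` from the project root. Alternatively, here is a minimal sketch of driving it from a plain script via Scrapy's CrawlerProcess (the file name run.py is my own choice):

```python
# run.py -- minimal sketch: start the spider from a script instead of
# the `scrapy crawl lnzxzb` command (the file name is illustrative)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # load settings.py
process.crawl('lnzxzb')  # spider name as declared in LnzxzbSpider.name
process.start()          # blocks until the crawl finishes
```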
- In the pipeline file, pipelines.py:
```python
import pymongo


class LnzbPipeline(object):
    # Connect to the local MongoDB server and select the
    # 'mm' database and the 'lnzxzb' collection
    conn = pymongo.MongoClient('localhost', 27017)
    db = conn.mm
    table = db.lnzxzb

    def process_item(self, item, spider):
        # Convert the item to a plain dict and store it
        self.table.insert_one(dict(item))
        return item
```
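To verify that items actually reached MongoDB, a quick pymongo check (the database `mm` and collection `lnzxzb` are taken from the pipeline above) might look like:

```python
# Quick sanity check of the stored items -- a sketch, reusing the
# database/collection names from the pipeline above
import pymongo

conn = pymongo.MongoClient('localhost', 27017)
table = conn.mm.lnzxzb

print(table.count_documents({}))   # number of announcements saved
for doc in table.find().limit(5):  # peek at the first few records
    print(doc['title'], doc['date'])
```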
- In the items file, items.py:
```python
import scrapy


class LnzbItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    date = scrapy.Field()
```
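A `scrapy.Item` supports dict-style access, which is what lets the pipeline call `dict(item)` before inserting; a tiny illustration (the field values are placeholders):

```python
# Items behave like dicts; this is why dict(item) works in the pipeline.
# The values below are placeholders.
item = LnzbItem()
item['title'] = 'some announcement title'
item['date'] = '2019-01-01'
print(dict(item))  # {'title': 'some announcement title', 'date': '2019-01-01'}
```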
- In settings.py, uncomment the item pipeline, set a User-Agent (so requests pass UA checks), and turn off robots.txt compliance, as sketched below.
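For reference, the three changes might look like this in settings.py (the project module name `lnzb` and the exact User-Agent string are assumptions; adjust them to your project):

```python
# settings.py -- sketch of the three changes; the module name 'lnzb'
# and the UA string are assumptions
ROBOTSTXT_OBEY = False  # do not obey robots.txt

# Any common browser User-Agent works here
USER_AGENT = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
              'AppleWebKit/537.36 (KHTML, like Gecko) '
              'Chrome/76.0.3809.100 Safari/537.36')

# Enable the pipeline so yielded items reach MongoDB
ITEM_PIPELINES = {
    'lnzb.pipelines.LnzbPipeline': 300,
}
```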