The following code was tested under Python 3.6. The spider code is as follows:
#!/usr/bin/python
# -*- coding:utf-8 -*-
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector
from storage.items import StorageItem
class StorageSpider(Spider):
"""
有三个必需的定义的成员:name,start_urls,parse()
"""
name = "storage" #这个spider的标识
allowed_domains = ["www.zyc56.org.cn"] #域名限制
start_urls = [ #一个url列表,spider从这些网页开始抓取
"http://www.zyc56.org.cn/index.php?m=content&c=index&a=lists&catid=31"
]
    def parse(self, response):
        sel = Selector(response)
        item = StorageItem()
        mainXpath = sel.xpath('//div[@class="map_intro clear"]')
        elseXpath = sel.xpath('//div[@class="map_article"]')
        item['crawlUrl'] = response.url
        item['enterpriseName'] = mainXpath.xpath('dl/dd[1]/text()').extract()  # company name
        item['contactUser'] = mainXpath.xpath('dl/dd[2]/text()').extract()  # contact person
        item['contactNumber'] = mainXpath.xpath('dl/dd[3]/b/text()').extract()  # contact phone number
        item['warehouseType'] = mainXpath.xpath('dl/dd[4]/text()').extract()  # warehouse type
        item['releaseTime'] = mainXpath.xpath('dl/dt/span/text()').extract()  # release time
        item['warehouseAddress'] = elseXpath.xpath('div/span/text()').extract()  # region
        item['warehouseDetailAddr'] = elseXpath.xpath('div/text()[2]').extract()  # detailed address
        sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库规模")]/following-sibling::td[position()=1]')
        if not len(sonPath):  # empty result: fall back to the alternate table layout
            sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库建设方案")]/../following-sibling::tr/td[position()=2]')
        # strip non-breaking spaces and the "平米" (square meters) unit, then normalize whitespace
        item['warehouseSize'] = sonPath.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()
        if len(item['enterpriseName']):
            yield item
        alinkList = sel.xpath('//dd[@class="intro"]/a/@href').extract()
        for alink in alinkList:
            # urljoin handles both absolute and relative hrefs
            yield Request(url=response.urljoin(alink), callback=self.parse)
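The XPath expressions above are tied to the current markup of zyc56.org.cn, and the extracted text still carries its field label (e.g. 公司名称:), which is why the pipeline below slices off a prefix. A quick way to verify the expressions offline is to run them against a hand-written HTML fragment; the sample markup here is an assumption that only mimics the expected page structure:

# Minimal sketch: check the field XPaths against a hand-written HTML
# fragment. The markup below is assumed and only mimics the page structure.
from scrapy.selector import Selector

sample_html = """
<div class="map_intro clear">
  <dl>
    <dt><span>发布:2018-01-01</span></dt>
    <dd>公司名称:某仓储有限公司</dd>
    <dd>联系人:张三</dd>
    <dd>电话:<b>13800000000</b></dd>
    <dd>仓库类型:平面仓</dd>
  </dl>
</div>
"""

sel = Selector(text=sample_html)
main = sel.xpath('//div[@class="map_intro clear"]')
print(main.xpath('dl/dd[1]/text()').extract())    # ['公司名称:某仓储有限公司']
print(main.xpath('dl/dd[3]/b/text()').extract())  # ['13800000000']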
The pipelines.py code is as follows:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#from scrapy.exporters import JsonItemExporter
import pymysql
class StoragePipeline(object):
    # def open_spider(self, spider):
    #     # Optional; called when the spider is opened.
    #     # Exports items to the w3school_data_utf8.json file.
    #     self.file = open('w3school_data_utf8.json', 'wb')
    #     self.exporter = JsonItemExporter(self.file, encoding='utf-8')
    #     self.exporter.start_exporting()
    #
    # def close_spider(self, spider):
    #     # Optional; called when the spider is closed.
    #     self.exporter.finish_exporting()
    #     self.file.close()
    #
    # def process_item(self, item, spider):
    #     self.exporter.export_item(item)
    #     return item
    def __init__(self):
        # single pymysql connection, shared across all items
        self.conn = pymysql.connect(
            host='127.0.0.1',
            db='db_scrapy',
            user='root',
            passwd='abc123',
            charset='utf8'
        )
    def process_item(self, item, spider):
        db = self.conn
        cur = db.cursor()
        try:
            cur.execute(
                "insert into storage_info(enterprise_name, warehouse_address, "
                "warehouse_detail_addr, warehouse_size, warehouse_type, "
                "contact_user, contact_number, release_time, add_type, crawl_url) "
                "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (
                    item['enterpriseName'][0][5:],  # strip the leading field label, e.g. "公司名称:"
                    item['warehouseAddress'][0],
                    item['warehouseDetailAddr'][0].strip()[5:],
                    item['warehouseSize'][0].strip(),
                    item['warehouseType'][0][5:],
                    item['contactUser'][0][4:],
                    item['contactNumber'][0],
                    item['releaseTime'][0][3:],
                    1,  # add_type
                    item['crawlUrl']
                )
            )
            db.commit()
        except Exception as e:
            print('Error:', e)
            db.rollback()
        finally:
            cur.close()  # keep the connection open for the next item
        return item

    def close_spider(self, spider):
        # close the connection once, when the spider finishes; closing it
        # inside process_item would break on the second item
        self.conn.close()
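The pipeline assumes a storage_info table already exists in db_scrapy. The schema below is reconstructed from the INSERT statement; column types and lengths are assumptions, so adjust them to the real data. A one-off setup sketch using pymysql:

# One-off setup sketch: create the storage_info table the pipeline writes to.
# Column types/lengths are assumptions inferred from the INSERT statement.
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS storage_info (
    id INT AUTO_INCREMENT PRIMARY KEY,
    enterprise_name VARCHAR(255),
    warehouse_address VARCHAR(255),
    warehouse_detail_addr VARCHAR(255),
    warehouse_size VARCHAR(64),
    warehouse_type VARCHAR(64),
    contact_user VARCHAR(64),
    contact_number VARCHAR(32),
    release_time VARCHAR(32),
    add_type INT,
    crawl_url VARCHAR(512)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='127.0.0.1', db='db_scrapy',
                       user='root', passwd='abc123', charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute(ddl)
    conn.commit()
finally:
    conn.close()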
The items.py code is as follows:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class StorageItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    enterpriseName = scrapy.Field()
    warehouseAddress = scrapy.Field()
    warehouseDetailAddr = scrapy.Field()
    warehouseSize = scrapy.Field()
    warehouseType = scrapy.Field()
    releaseTime = scrapy.Field()
    contactUser = scrapy.Field()
    contactNumber = scrapy.Field()
    addType = scrapy.Field()
    crawlUrl = scrapy.Field()
The following configuration needs to be changed in settings.py:
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'storage.pipelines.StoragePipeline': 300,
}
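With the pipeline registered, the spider can be run from the project root with `scrapy crawl storage`. Alternatively, a minimal sketch for launching it from a Python script, assuming the standard Scrapy project layout so that get_project_settings() can locate settings.py:

# Minimal sketch: run the "storage" spider programmatically instead of
# via the `scrapy crawl storage` command.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('storage')  # spider is looked up by its `name` attribute
process.start()  # blocks until crawling is finished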