Scrapy: crawl a list of links, then crawl the content behind each link

This post walks through a crawler built with the Python Scrapy framework. It scrapes company information (company name, contact person, phone number, and so on) from a website and stores the data in a MySQL database. The code below shows the structure of the crawler, covering the key steps: sending requests, parsing the responses, and persisting the data.

The following code was tested under Python 3.6.
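
Assuming the Scrapy project is named storage (which is what the storage.items and storage.pipelines import paths below imply), a minimal setup is: install the dependencies with pip install scrapy pymysql, generate the project skeleton with scrapy startproject storage, and save the spider below under storage/spiders/ (the file name does not matter, only the name attribute does).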

#!/usr/bin/python  
# -*- coding:utf-8 -*-  
from scrapy.http import Request
from scrapy.spiders import Spider
from scrapy.selector import Selector
from storage.items import StorageItem

class StorageSpider(Spider):
    """
    有三个必需的定义的成员:name,start_urls,parse()
    """
    name = "storage" #这个spider的标识
    allowed_domains = ["www.zyc56.org.cn"] #域名限制
    start_urls = [  #一个url列表,spider从这些网页开始抓取
        "http://www.zyc56.org.cn/index.php?m=content&c=index&a=lists&catid=31"
    ]
    
    def parse(self, response):
        sel = Selector(response)
        item = StorageItem()
        
        mainXpath = sel.xpath('//div[@class="map_intro clear"]')
        elseXpath = sel.xpath('//div[@class="map_article"]')
        
        item['crawlUrl'] = response.url
        item['enterpriseName'] = mainXpath.xpath('dl/dd[1]/text()').extract()  # company name
        item['contactUser'] = mainXpath.xpath('dl/dd[2]/text()').extract()  # contact person
        item['contactNumber'] = mainXpath.xpath('dl/dd[3]/b/text()').extract()  # contact phone number
        item['warehouseType'] = mainXpath.xpath('dl/dd[4]/text()').extract()  # warehouse type
        item['releaseTime'] = mainXpath.xpath('dl/dt/span/text()').extract()  # publication date

        item['warehouseAddress'] = elseXpath.xpath('div/span/text()').extract()  # region
        item['warehouseDetailAddr'] = elseXpath.xpath('div/text()[2]').extract()  # detailed address

        # cell following the "仓库规模" (warehouse size) label
        sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库规模")]/following-sibling::td[position()=1]')
        if not len(sonPath):  # empty SelectorList: fall back to the layout with a "仓库建设方案" (construction plan) header row
            sonPath = elseXpath.xpath('table/tbody/tr/td[contains(text(),"仓库建设方案")]/../following-sibling::tr/td[position()=2]')
        
        item['warehouseSize'] = sonPath.xpath('normalize-space(translate(translate(string(.),"\xa0",""),"平米",""))').extract()  # size with non-breaking spaces and the "平米" (square meter) unit stripped
        
        if len(item['enterpriseName']):  # only yield an item when a company name was extracted (i.e. on detail pages)
            yield item

        alinkList = sel.xpath('//dd[@class="intro"]/a/@href').extract()  # links to the detail pages
        for alink in alinkList:
            yield Request(url=response.urljoin(alink), callback=self.parse)  # urljoin also handles relative hrefs
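
The XPath expressions above are tied to the page layout of zyc56.org.cn, so it is worth checking them interactively before a full crawl. A quick sketch using Scrapy's built-in shell (the URL is the one from start_urls):

scrapy shell "http://www.zyc56.org.cn/index.php?m=content&c=index&a=lists&catid=31"

Inside the shell, the selectors used by parse() can then be tried directly:

response.xpath('//dd[@class="intro"]/a/@href').extract()                      # detail-page links
response.xpath('//div[@class="map_intro clear"]/dl/dd[1]/text()').extract()   # company name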

The pipelines.py file:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

#from scrapy.exporters import JsonItemExporter
import pymysql

class StoragePipeline(object):

#    def open_spider(self, spider):
#        # Optional: called when the spider is opened.
#        # Exports items to w3school_data_utf8.json
#        self.file = open('w3school_data_utf8.json', 'wb')
#        self.exporter = JsonItemExporter(self.file, encoding='utf-8')
#        self.exporter.start_exporting()
#
#    def close_spider(self, spider):
#        # Optional: called when the spider is closed.
#        self.exporter.finish_exporting()
#        self.file.close()
#
#    def process_item(self, item, spider):
#        self.exporter.export_item(item)
#        return item
    def __init__(self):
        # a single pymysql connection, kept open for the lifetime of the pipeline
        self.dbpool = pymysql.connect(
            host = '127.0.0.1',
            db = 'db_scrapy',
            user = 'root',
            passwd = 'abc123',
            charset = 'utf8'
        )
        
    def process_item(self, item, spider):
        db = self.dbpool
        cur = db.cursor()
        try:
            # slices such as [5:] and [4:] strip the Chinese label prefixes
            # that are extracted together with the values
            cur.execute("insert into storage_info(enterprise_name, warehouse_address, warehouse_detail_addr, warehouse_size, warehouse_type, contact_user, contact_number, release_time, add_type, crawl_url) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                (
                item['enterpriseName'][0][5:],
                item['warehouseAddress'][0],
                item['warehouseDetailAddr'][0].strip()[5:],
                item['warehouseSize'][0].strip(),
                item['warehouseType'][0][5:],
                item['contactUser'][0][4:],
                item['contactNumber'][0],
                item['releaseTime'][0][3:],
                1,
                item['crawlUrl']
                )
            )
            db.commit()
        except Exception as e:
            print('Insert failed: {}'.format(e))
            db.rollback()  # keep the connection open so later items can still be stored

        return item

    def close_spider(self, spider):
        # close the MySQL connection once the spider finishes
        self.dbpool.close()
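
The pipeline assumes a storage_info table already exists in the db_scrapy database. The original post does not show the table definition, so the following one-off helper is only a sketch matching the columns used in the INSERT above; the column types and lengths are assumptions.

# create_table.py -- one-off helper; column types/lengths are assumptions
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS storage_info (
    id INT AUTO_INCREMENT PRIMARY KEY,
    enterprise_name VARCHAR(255),
    warehouse_address VARCHAR(255),
    warehouse_detail_addr VARCHAR(255),
    warehouse_size VARCHAR(64),
    warehouse_type VARCHAR(64),
    contact_user VARCHAR(64),
    contact_number VARCHAR(64),
    release_time VARCHAR(64),
    add_type INT,
    crawl_url VARCHAR(512)
) DEFAULT CHARSET=utf8
"""

conn = pymysql.connect(host='127.0.0.1', db='db_scrapy', user='root', passwd='abc123', charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute(DDL)
    conn.commit()
finally:
    conn.close()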

The items.py file:

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class StorageItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    enterpriseName = scrapy.Field()
    warehouseAddress = scrapy.Field()
    warehouseDetailAddr = scrapy.Field()
    warehouseSize = scrapy.Field()
    warehouseType = scrapy.Field()
    releaseTime = scrapy.Field()
    contactUser = scrapy.Field()
    contactNumber = scrapy.Field()
    addType = scrapy.Field()
    crawlUrl = scrapy.Field()

In settings.py, enable the pipeline by adding the following configuration:

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'storage.pipelines.StoragePipeline': 300,
}
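
With the pipeline enabled, the crawl can be started from the project root with scrapy crawl storage. If you prefer launching it from a plain Python script (for example from an IDE), Scrapy's CrawlerProcess can be used; a minimal sketch, assuming the spider file is storage/spiders/storage_spider.py:

# run.py -- launch the crawl programmatically (sketch)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from storage.spiders.storage_spider import StorageSpider  # module path is an assumption

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads settings.py, including ITEM_PIPELINES
    process.crawl(StorageSpider)
    process.start()  # blocks until crawling is finished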