scrapy抓取企业名录

我们要用scrapy抓取企业名录网站的企业信息,并保存到mysql数据库中,数据量大概是22万条。

第一步,先在item中定义好要抓取的字段

import scrapy


class RepairSpiderItem(scrapy.Item):
    """Container for one company record scraped from the directory site.

    The spider's detail-page callback fills in every field, and the item
    pipeline writes them to the database column of the same name.
    """
    city_name = scrapy.Field()  # city the company is listed under
    area_name = scrapy.Field()  # area within the city
    company_name = scrapy.Field()  # company name
    company_address = scrapy.Field()  # company address
    phone = scrapy.Field()  # phone number
    mobile_phone = scrapy.Field()  # mobile phone number

第二步,定义spider的抓取逻辑(下面的方法写在spider类中,需要先导入 scrapy 的 Request、lxml 的 etree,以及第一步定义的 RepairSpiderItem)

 

    def start_requests(self):
        """Yield the single seed request for the crawl.

        The site's root URL is requested and its response is handed to
        ``parse``.
        """
        root_url = 'http://xiu.iqixiu.cn/'
        yield Request(root_url, callback=self.parse)

    def parse(self, response):
        """Collect the listing links on the start page and follow each one.

        Every table row of the page is visited; the links inside the
        ``<font>`` elements of the row's second cell are turned into absolute
        URLs and requested, with ``parse_dail`` as the callback.

        Args:
            response: the downloaded start page.

        Yields:
            Request: one request per link found.
        """
        base_url = 'http://xiu.iqixiu.cn/'
        html1 = etree.HTML(response.text)
        # One <tr> per table row of the page.
        for tr in html1.xpath('/html/body/table/tbody/tr'):
            # The links live in the <font> elements of the second cell.
            for td in tr.xpath('./td[2]/font'):
                hrefs = td.xpath('./a/@href')
                if not hrefs:
                    # A <font> without a link used to raise IndexError here,
                    # which aborted the callback and dropped every link after it.
                    continue
                # Percent-encode curly braces so the URL is valid.
                href = str(hrefs[0]).replace('{', '%7B').replace('}', '%7D')
                yield Request(url=base_url + href, callback=self.parse_dail)
    def parse_dail(self, response):
        """Parse one listing page and yield a ``RepairSpiderItem`` per data row.

        The first two table rows are treated as the title row and the
        column-header row; every later row is one company record. Each row is
        flattened to text and split into lines, and fields are picked by line
        index. Two layouts are handled, told apart by the header row:

        * '名称' appears in header line 3: data rows have no city column, so
          the city comes from the title row and the fields start at line 2
          (area, name, address, phone, mobile).
        * otherwise: data rows carry the city at line 2 and every other field
          sits one line further right.

        Rows that are too short to hold all fields are skipped instead of
        raising ``IndexError`` and aborting the whole page.

        Args:
            response: the downloaded listing page.

        Yields:
            RepairSpiderItem: one populated item per usable data row.
        """
        html = etree.HTML(response.text)
        tr_list = html.xpath(r'//table/tbody/tr')
        # Rows 0 and 1 are title and header; without a third row there is no data.
        if len(tr_list) < 3:
            return

        def row_lines(row):
            # Flatten the row to text, drop spaces and carriage returns, and
            # split on newlines so fields can be picked by index.
            return row.xpath('string()').replace(' ', '').replace('\r', '').split('\n')

        # The layout is a property of the page, so work it out once, not per row.
        header_lines = row_lines(tr_list[1])
        no_city_column = len(header_lines) > 3 and '名称' in header_lines[3]
        if no_city_column:
            shift = 0
            # NOTE(review): the original code was `.split('')[0] + ''`, which
            # always raises "ValueError: empty separator" -- the delimiter
            # character appears to have been lost. Until the intended delimiter
            # is confirmed, use the first whitespace-separated token of the
            # title row.
            title_words = tr_list[0].xpath('string()').split()
            page_city = title_words[0] if title_words else ''
        else:
            shift = 1
            page_city = None

        for tr in tr_list[2:]:
            name_list = row_lines(tr)
            # The last field read is at index 6 + shift; skip malformed rows.
            if len(name_list) < 7 + shift:
                continue

            # Per the original author's note, the visible cell text can be
            # incomplete while the cell's last attribute value holds the full
            # company name. Fall back to the text when the cell has no
            # attributes or the last attribute is just the 'nowrap' flag.
            attr_values = tr.xpath('./td[%d]/@*' % (3 + shift))
            company_name = attr_values[-1].replace("'", '') if attr_values else ''
            if not company_name or company_name == 'nowrap':
                company_name = name_list[3 + shift]

            # Build the item that is handed to the pipeline.
            repair_item = RepairSpiderItem()
            repair_item['city_name'] = page_city if no_city_column else name_list[2]
            repair_item['area_name'] = name_list[2 + shift]
            repair_item['company_name'] = company_name
            repair_item['company_address'] = name_list[4 + shift]
            repair_item['phone'] = name_list[5 + shift]
            repair_item['mobile_phone'] = name_list[6 + shift]
            yield repair_item

 

第三步,在settings.py里启用保存数据的管道,也就是把默认生成的第67、68行附近被注释掉的ITEM_PIPELINES配置取消注释

# Enable the pipeline class that stores scraped items (defined in pipelines.py).
# NOTE(review): per the Scrapy documentation the integer is the pipeline's
# order value -- pipelines with lower values run first.
ITEM_PIPELINES = {
   'repair_spider.pipelines.RepairSpiderPipeline': 300,
}

第四步,在pipelines中定义存储

import pymysql


class RepairSpiderPipeline(object):
    """Item pipeline that inserts every scraped item into a MySQL table.

    The connection is opened and the ``klkl_Service_shop`` table is created
    (if missing) when the pipeline is instantiated; each item is inserted and
    committed individually; the connection is released in ``close_spider``.
    """

    def __init__(self):
        # SECURITY NOTE(review): host and root credentials are hard-coded here;
        # they should be moved to settings or environment variables.
        self.coon = pymysql.connect(host='193.112.180.37', user='root', password='***********', port=3306, db='*********')
        self.cursor = self.coon.cursor()
        self.cursor.execute("create table IF NOT EXISTS klkl_Service_shop(city_name char(50), area_name char(50), company_name char(100), company_address char(100), phone char(50), mobile_phone char(50));")
        self.coon.commit()

    def process_item(self, item, spider):
        """Insert one item into ``klkl_Service_shop`` and pass it on.

        A failed insert is rolled back and reported, but never stops the
        crawl: the item is returned in every case.

        Args:
            item: mapping with the six company fields.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged ``item``.
        """
        sql = 'insert into klkl_Service_shop(city_name,area_name,company_name,company_address,phone,mobile_phone) values (%s,%s,%s,%s,%s,%s)'
        try:
            # Re-establish the connection if the server dropped it during a long crawl.
            self.coon.ping(reconnect=True)
            self.cursor.execute(sql, (item['city_name'], item['area_name'], item['company_name'], item['company_address'], item['phone'], item['mobile_phone']))
            self.coon.commit()
        except Exception as exc:
            # `Exception` rather than a bare `except:` so KeyboardInterrupt and
            # SystemExit still propagate; the reason is printed instead of hidden.
            self.coon.rollback()
            print('klkl_Service_shop提交失败', repr(exc))
        else:
            print('klkl_Service_shop提交成功')
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.coon.close()

 

转载于:https://www.cnblogs.com/chaojiyingxiong/p/10362442.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值