scrapy抓取企业名录

最新推荐文章于 2023-02-15 10:23:43 发布

airangrong6572

最新推荐文章于 2023-02-15 10:23:43 发布

阅读量321

点赞数

CC 4.0 BY-SA版权

文章标签： python 数据库

原文链接：http://www.cnblogs.com/chaojiyingxiong/p/10362442.html

我们要用 Scrapy 抓取企业名录网站的企业信息，并保存到 MySQL 数据库中，数据量大约 22 万条。

第一步，先在 item 中定义好要抓取的字段。

import scrapy


class RepairSpiderItem(scrapy.Item):
    """One company record scraped from the enterprise directory.

    The six fields match, name for name, the columns that the storage
    pipeline inserts into the ``klkl_Service_shop`` table.
    """

    city_name = scrapy.Field()        # city the company is listed under
    area_name = scrapy.Field()        # district / area within the city
    company_name = scrapy.Field()     # company name
    company_address = scrapy.Field()  # street address
    phone = scrapy.Field()            # landline number
    mobile_phone = scrapy.Field()     # mobile number

第二步，定义spider的抓取逻辑。

    def start_requests(self):
        """Seed the crawl with the directory's index page.

        Yields a single request whose response is handled by ``parse``.
        """
        yield Request(url='http://xiu.iqixiu.cn/', callback=self.parse)

    def parse(self, response):
        """Follow every listing link found on the index page.

        Walks each row of the top-level table, looks at the ``<font>``
        elements inside the second cell, and schedules one request per
        anchor found there, handled by ``parse_dail``.

        A ``<font>`` element without an ``<a href>`` is skipped rather than
        raising ``IndexError``, so one link-less cell no longer aborts the
        generator and drops the remaining links of the page.
        """
        html1 = etree.HTML(response.text)
        # One <tr> per table row.
        for tr in html1.xpath('/html/body/table/tbody/tr'):
            # Candidate link holders live in the second cell of the row.
            for td in tr.xpath('./td[2]/font'):
                hrefs = td.xpath('./a/@href')
                if not hrefs:
                    continue
                href = hrefs[0]
                print('href:', href)
                # Curly braces in the relative link are percent-encoded
                # before being appended to the site root.
                href_url = 'http://xiu.iqixiu.cn/' + str(href.replace('{', '%7B').replace('}', '%7D'))
                print('href_url:', href_url)
                yield Request(url=href_url, callback=self.parse_dail)
    def parse_dail(self, response):
        """Parse a listing page and yield one ``RepairSpiderItem`` per data row.

        The first two table rows are treated as title and header; data rows
        start at the third. Each data row's text is stripped of spaces and
        carriage returns and split on newlines, and fields are then picked
        by position. Two column layouts are handled, chosen once per page
        from the header row:

        * the fourth header segment contains "名称": the page has no city
          column, so the city is taken from the title row (text up to and
          including the first "市"), and the company name comes from the
          last attribute of the third cell, unless that attribute is
          ``nowrap``, in which case the cell text is used;
        * otherwise: the city is a column of its own, every position shifts
          by one, and the company name is the last attribute of the fourth
          cell.

        The attribute value is preferred because the visible cell text is
        not always complete.

        A data row that is too short for the layout is reported and skipped
        instead of raising ``IndexError`` and discarding the rest of the
        page.
        """
        html = etree.HTML(response.text)
        tr_list = html.xpath(r'//table/tbody/tr')
        if len(tr_list) < 3:
            # Only title/header rows (or nothing): no data rows to extract.
            return

        # The layout is a property of the page, so decide it once up front.
        header_cells = tr_list[1].xpath('string()').replace(' ', '').replace('\r', '').split('\n')
        city_from_title = '名称' in header_cells[3]
        if city_from_title:
            page_city = tr_list[0].xpath('string()').split('市')[0] + '市'

        for tr in tr_list[2:]:
            # All text of the row as a positional list of segments.
            name_list = tr.xpath('string()').replace(' ', '').replace('\r', '').split('\n')
            print(name_list)
            try:
                if city_from_title:
                    name_attr = tr.xpath('./td[3]/@*')[-1].replace("'", '')
                    fields = {
                        'city_name': page_city,
                        'area_name': name_list[2],
                        # 'nowrap' means the cell carries no name attribute;
                        # fall back to the cell text.
                        'company_name': name_list[3] if name_attr == 'nowrap' else name_attr,
                        'company_address': name_list[4],
                        'phone': name_list[5],
                        'mobile_phone': name_list[6],
                    }
                else:
                    fields = {
                        'city_name': name_list[2],
                        'area_name': name_list[3],
                        'company_name': tr.xpath('./td[4]/@*')[-1].replace("'", ''),
                        'company_address': name_list[5],
                        'phone': name_list[6],
                        'mobile_phone': name_list[7],
                    }
            except IndexError:
                print('skipping malformed row:', name_list)
                continue

            Repair_item = RepairSpiderItem()
            for key, value in fields.items():
                Repair_item[key] = value
            yield Repair_item

第三步，在 settings.py 里启用保存数据的管道，也就是把默认被注释掉的第 67、68 行取消注释。

# Enable the storage pipeline; 300 is the value Scrapy uses to order pipelines.
ITEM_PIPELINES = {'repair_spider.pipelines.RepairSpiderPipeline': 300}

第四步，在pipelines中定义存储

import pymysql


class RepairSpiderPipeline(object):
    """Store every scraped item as one row of the ``klkl_Service_shop`` table.

    The MySQL connection is opened when the pipeline is constructed and
    released in ``close_spider``, which Scrapy calls when the crawl ends.
    """

    # Column order shared by the INSERT statement and the item lookup.
    FIELDS = ('city_name', 'area_name', 'company_name',
              'company_address', 'phone', 'mobile_phone')

    def __init__(self):
        # NOTE(review): host and credentials are hard-coded here; consider
        # reading them from the project settings instead.
        self.coon = pymysql.connect(host='193.112.180.37', user='root', password='***********', port=3306, db='*********')
        self.cursor = self.coon.cursor()
        self.cursor.execute("create table IF NOT EXISTS klkl_Service_shop(city_name char(50), area_name char(50), company_name char(100), company_address char(100), phone char(50), mobile_phone char(50));")
        self.coon.commit()

    def process_item(self, item, spider):
        """Insert ``item`` into the table and return it unchanged.

        A failed insert is reported together with its cause and rolled
        back; the item is still returned so later pipelines keep running.
        """
        sql = 'insert into klkl_Service_shop(city_name,area_name,company_name,company_address,phone,mobile_phone) values (%s,%s,%s,%s,%s,%s)'
        try:
            # Re-establish the connection if the server dropped it.
            self.coon.ping(reconnect=True)
            self.cursor.execute(sql, tuple(item[name] for name in self.FIELDS))
            self.coon.commit()
        except Exception as exc:
            # Best-effort storage: report the cause instead of hiding it,
            # but do not catch KeyboardInterrupt/SystemExit as a bare
            # ``except:`` would.
            print('klkl_Service_shop提交失败', exc)
            try:
                self.coon.rollback()
            except pymysql.Error:
                # The connection itself is unusable; there is nothing to
                # roll back, and the next ping() will try to reconnect.
                pass
        else:
            print('klkl_Service_shop提交成功')
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.coon.close()