国家统计局 省市区 数据爬取

代码参考网络资料,经修改整合而成;有需要的读者可自行保存使用

import requests
from bs4 import BeautifulSoup
import pymysql
import time

"""
从国家统计局爬取省市区数据
保存到数据库 或者 生成 txt 数据源
"""


class GetCitysToLocal(object):
    """Scrape province / city / county data from the National Bureau of
    Statistics site (stats.gov.cn) and save it either to a MySQL table
    or to plain-text data-source files."""

    def __init__(self):
        # Save to database (uncomment to enable; savetodb() requires self.db):
        # self.db = pymysql.connect(host="localhost", user="root", password="root", database="szjy")
        # self.savetodb()
        # self.db.close()

        # Generate province / city / county txt data-source files.
        self.getSSQ()

    def savetodb(self):
        """Crawl provinces -> cities -> counties and insert each level into
        the dict_administrative_area table.

        Requires ``self.db`` (a pymysql connection) to be initialised in
        ``__init__`` before this is called.
        """
        year = 2020  # statistics year to crawl
        base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%s/' % year
        sql = "insert into dict_administrative_area (area_code,area_name,parent_code,parent_id,area_level) values (%s,%s,%s,%s,%s)"
        province_trs = self.get_response(base_url, 'provincetr')
        for province_tr in province_trs:  # each table row of provinces
            for td in province_tr:  # each province cell in the row
                if td.a is None:
                    continue
                href_url = td.a.get('href')
                province_name = td.a.get_text()
                # Pad the 2-digit province prefix to the full 12-digit area code.
                province_code = str(href_url.split(".")[0]) + "0000000000"
                province_url = base_url + href_url

                print(province_name)

                # Insert the province and keep its primary key for the children.
                province_data = [province_code, province_name, '0', 0, 1]
                province_id = self.connect_mysql(sql, province_data)

                # Distinct names per nesting level (the original reused `trs`/`tr`
                # for all three levels, which was confusing shadowing).
                city_trs = self.get_response(province_url, None)
                for city_tr in city_trs[1:]:  # each city; [0] is the header row
                    city_tds = city_tr.find_all('td')
                    city_code = city_tds[0].string
                    city_name = city_tds[1].string

                    # Insert the city and keep its primary key.
                    city_data = [city_code, city_name, province_code, province_id, 2]
                    city_id = self.connect_mysql(sql, city_data)

                    city_url = base_url + city_tds[1].a.get('href')
                    county_trs = self.get_response(city_url, None)
                    for county_tr in county_trs[1:]:  # each county; [0] is the header row
                        county_tds = county_tr.find_all('td')
                        county_code = county_tds[0].string
                        county_name = county_tds[1].string

                        # Insert the county; its primary key is not needed further.
                        county_data = [county_code, county_name, city_code, city_id, 3]
                        self.connect_mysql(sql, county_data)

                time.sleep(1)  # throttle requests: be polite to the server
            time.sleep(1)

    @staticmethod
    def get_response(url, attr):
        """Fetch *url* and return the <tr> rows of the area table.

        attr -- a tr class name to filter on (e.g. 'provincetr'),
                or None/falsy to return every row.
        """
        response = requests.get(url)
        # 'gbk' is a superset of gb2312; the narrower codec produced
        # mojibake for the occasional character outside gb2312.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, features="html.parser")
        # The data table is nested inside several tbody wrappers; this
        # navigation mirrors the site's 2020 markup. NOTE(review): will
        # break silently if the site layout changes — confirm periodically.
        table = soup.find_all('tbody')[1].tbody.tbody.table
        if attr:
            trs = table.find_all('tr', attrs={'class': attr})
        else:
            trs = table.find_all('tr')
        return trs

    def connect_mysql(self, sql, data):
        """Execute *sql* with *data* on ``self.db`` and return the last
        insert id (or None on error).

        data may be a list of parameter lists (executemany), a single
        parameter list (execute), or falsy (plain execute + fetchall).
        Commits on success, rolls back on error.
        """
        cursor = self.db.cursor()
        result = None
        try:
            if data:
                if isinstance(data[0], list):
                    cursor.executemany(sql, data)
                else:
                    cursor.execute(sql, data)
            else:
                cursor.execute(sql)
                cursor.fetchall()
            result = self.db.insert_id()
            # Commit only on success; the original committed in `finally`
            # even right after a rollback, and returned from `finally`,
            # which would also swallow any non-Exception error.
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        finally:
            cursor.close()
        return result

    # Append *msg* to the file at *file_path* (UTF-8).
    @staticmethod
    def create_file(file_path, msg):
        """Append *msg* to *file_path*, creating the file if needed.

        Fixed: the original called ``f.close`` without parentheses, so
        the handle was never explicitly closed; a context manager
        guarantees the close even on write errors.
        """
        with open(file_path, "a", encoding='utf-8') as f:
            f.write(msg)

    def getSSQ(self):
        """Crawl provinces / cities / counties and dump three txt data
        sources (./province.txt, ./city.txt, ./area.txt) containing
        {label, value} dicts suitable for a cascading selector."""
        year = 2020  # statistics year to crawl
        base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%s/' % year
        province_trs = self.get_response(base_url, 'provincetr')
        provinces = []  # flat list of {label, value} per province
        cityss = []     # per-province list of city dicts
        areas = []      # per-province list of per-city county lists
        for province_tr in province_trs:  # each table row of provinces
            for td in province_tr:  # each province cell in the row
                if td.a is None:
                    continue
                href_url = td.a.get('href')
                province_name = td.a.get_text()
                province_code = str(href_url.split(".")[0])
                province_url = base_url + href_url

                provinces.append({'label': province_name, 'value': province_code})

                citys = []
                countyss = []
                city_trs = self.get_response(province_url, None)
                for city_tr in city_trs[1:]:  # each city; [0] is the header row
                    city_tds = city_tr.find_all('td')
                    # Keep only the 3-digit prefix of the 12-digit area code.
                    city_code = city_tds[0].string[0:3]
                    city_name = city_tds[1].string

                    citys.append({'label': city_name, 'value': city_code})

                    countys = []
                    city_url = base_url + city_tds[1].a.get('href')
                    county_trs = self.get_response(city_url, None)
                    for county_tr in county_trs[1:]:  # each county; [0] is the header row
                        county_tds = county_tr.find_all('td')
                        # Keep only the 5-digit prefix of the 12-digit area code.
                        county_code = county_tds[0].string[0:5]
                        county_name = county_tds[1].string
                        countys.append({'label': county_name, 'value': county_code})
                    countyss.append(countys)

                cityss.append(citys)
                areas.append(countyss)

            time.sleep(1)  # throttle requests: be polite to the server

        # Write the aggregated data sources for all provinces.
        self.create_file('./province.txt', str(provinces))
        self.create_file('./city.txt', str(cityss))
        self.create_file('./area.txt', str(areas))


if __name__ == "__main__":
    # Run the scraper/exporter only when executed as a script (no-op on import).
    GetCitysToLocal()

国家统计局抓取的地图省市区划代码和城划分代码(最新2020/06/03),共596071条数据。来源于国家统计局http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居
以下是Python爬取国家统计局省市区详细数据的代码: ```python import requests import json # 请求头信息 headers = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Origin': 'http://www.stats.gov.cn', 'Referer': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36' } # 份列表请求URL province_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html' # 获取份列表 def get_province(): response = requests.get(province_url, headers=headers) response.encoding = 'gbk' province_list = response.text.split('<td><a href="')[1:] for province in province_list: province_code, province_name = province.split('.html">')[0], province.split('.html">')[1].split('</a></td>')[0] print(province_code, province_name) # 城市、区、街道请求URL city_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{}/{}/{}.html' # 获取城市、区、街道数据 def get_data(url): response = requests.get(url, headers=headers) response.encoding = 'gbk' data_list = response.text.split('<tr class="')[1:] for data in data_list: if 'countytr' in data: code, name = data.split('<td>')[0].split('href="')[1].split('.html')[0][-6:], data.split('<td>')[1].split('</td>')[0] print(code, name) elif 'towntr' in data: code, name = data.split('<td>')[0].split('href="')[1].split('.html')[0][-9:], data.split('<td>')[1].split('</td>')[0] print(code, name) elif 'villagetr' in data: code, name = data.split('<td>')[0].split('</a>')[0][-12:], data.split('<td>')[2].split('</td>')[0] print(code, name) # 爬取数据 def spider(): # 获取份列表 response = requests.get(province_url, headers=headers) response.encoding = 'gbk' province_list = response.text.split('<td><a href="')[1:] for province in province_list: province_code = province.split('.html">')[0] province_name = province.split('.html">')[1].split('</a></td>')[0] print(province_code, province_name) # 获取城市数据 
city_code = province_code[:2] city_url_now = city_url.format(city_code, province_code, city_code+province_code) get_data(city_url_now) # 获取区、街道数据 if city_code in {'11', '12', '31', '50'}: area_code = province_code[:6] area_url_now = city_url.format(city_code+province_code[:2], area_code, area_code) get_data(area_url_now) else: city_list_url = city_url.format(city_code+province_code[:2], city_code+province_code, city_code+province_code) response = requests.get(city_list_url, headers=headers) response.encoding = 'gbk' city_list = response.text.split('<td><a href="')[1:] for city in city_list: city_code_now = city.split('.html">')[0][-4:] city_url_now = city_url.format(city_code+province_code[:2], city_code_now, city_code+city_code_now) get_data(city_url_now) if __name__ == '__main__': spider() ``` 代码中首先定义了请求头信息,然后定义了份列表的请求URL,通过get_province()方法爬取份列表。 之后定义了城市、区、街道的请求URL格式,再通过get_data()方法爬取数据。 最后在spider()方法中,先爬取份列表,再根据份代码获取城市数据,接着判断是否需要获取区、街道数据,并获取相应数据
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值