国家统计局 省市区 数据爬取

代码参考网络资料,经修改整合而成;有需要的读者可自行保存使用

import requests
from bs4 import BeautifulSoup
import pymysql
import time

"""
从国家统计局爬取省市区数据
保存到数据库 或者 生成 txt 数据源
"""


class GetCitysToLocal(object):
    """Scrape province / city / county data from the National Bureau of
    Statistics site (stats.gov.cn) and save it either to a MySQL table
    or to plain-text data-source files."""

    def __init__(self):
        # Save to database (uncomment to enable; savetodb() requires self.db):
        # self.db = pymysql.connect(host="localhost", user="root", password="root", database="szjy")
        # self.savetodb()
        # self.db.close()

        # Generate province / city / county txt data-source files.
        self.getSSQ()

    def savetodb(self):
        """Crawl provinces -> cities -> counties and insert each level into
        the dict_administrative_area table.

        Requires ``self.db`` (a pymysql connection) to be initialised in
        ``__init__`` before this is called.
        """
        year = 2020  # statistics year to crawl
        base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%s/' % year
        sql = "insert into dict_administrative_area (area_code,area_name,parent_code,parent_id,area_level) values (%s,%s,%s,%s,%s)"
        province_trs = self.get_response(base_url, 'provincetr')
        for province_tr in province_trs:  # each table row of provinces
            for td in province_tr:  # each province cell in the row
                if td.a is None:
                    continue
                href_url = td.a.get('href')
                province_name = td.a.get_text()
                # Pad the 2-digit province prefix to the full 12-digit area code.
                province_code = str(href_url.split(".")[0]) + "0000000000"
                province_url = base_url + href_url

                print(province_name)

                # Insert the province and keep its primary key for the children.
                province_data = [province_code, province_name, '0', 0, 1]
                province_id = self.connect_mysql(sql, province_data)

                # Distinct names per nesting level (the original reused `trs`/`tr`
                # for all three levels, which was confusing shadowing).
                city_trs = self.get_response(province_url, None)
                for city_tr in city_trs[1:]:  # each city; [0] is the header row
                    city_tds = city_tr.find_all('td')
                    city_code = city_tds[0].string
                    city_name = city_tds[1].string

                    # Insert the city and keep its primary key.
                    city_data = [city_code, city_name, province_code, province_id, 2]
                    city_id = self.connect_mysql(sql, city_data)

                    city_url = base_url + city_tds[1].a.get('href')
                    county_trs = self.get_response(city_url, None)
                    for county_tr in county_trs[1:]:  # each county; [0] is the header row
                        county_tds = county_tr.find_all('td')
                        county_code = county_tds[0].string
                        county_name = county_tds[1].string

                        # Insert the county; its primary key is not needed further.
                        county_data = [county_code, county_name, city_code, city_id, 3]
                        self.connect_mysql(sql, county_data)

                time.sleep(1)  # throttle requests: be polite to the server
            time.sleep(1)

    @staticmethod
    def get_response(url, attr):
        """Fetch *url* and return the <tr> rows of the area table.

        attr -- a tr class name to filter on (e.g. 'provincetr'),
                or None/falsy to return every row.
        """
        response = requests.get(url)
        # 'gbk' is a superset of gb2312; the narrower codec produced
        # mojibake for the occasional character outside gb2312.
        response.encoding = 'gbk'
        soup = BeautifulSoup(response.text, features="html.parser")
        # The data table is nested inside several tbody wrappers; this
        # navigation mirrors the site's 2020 markup. NOTE(review): will
        # break silently if the site layout changes — confirm periodically.
        table = soup.find_all('tbody')[1].tbody.tbody.table
        if attr:
            trs = table.find_all('tr', attrs={'class': attr})
        else:
            trs = table.find_all('tr')
        return trs

    def connect_mysql(self, sql, data):
        """Execute *sql* with *data* on ``self.db`` and return the last
        insert id (or None on error).

        data may be a list of parameter lists (executemany), a single
        parameter list (execute), or falsy (plain execute + fetchall).
        Commits on success, rolls back on error.
        """
        cursor = self.db.cursor()
        result = None
        try:
            if data:
                if isinstance(data[0], list):
                    cursor.executemany(sql, data)
                else:
                    cursor.execute(sql, data)
            else:
                cursor.execute(sql)
                cursor.fetchall()
            result = self.db.insert_id()
            # Commit only on success; the original committed in `finally`
            # even right after a rollback, and returned from `finally`,
            # which would also swallow any non-Exception error.
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        finally:
            cursor.close()
        return result

    # Append *msg* to the file at *file_path* (UTF-8).
    @staticmethod
    def create_file(file_path, msg):
        """Append *msg* to *file_path*, creating the file if needed.

        Fixed: the original called ``f.close`` without parentheses, so
        the handle was never explicitly closed; a context manager
        guarantees the close even on write errors.
        """
        with open(file_path, "a", encoding='utf-8') as f:
            f.write(msg)

    def getSSQ(self):
        """Crawl provinces / cities / counties and dump three txt data
        sources (./province.txt, ./city.txt, ./area.txt) containing
        {label, value} dicts suitable for a cascading selector."""
        year = 2020  # statistics year to crawl
        base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/%s/' % year
        province_trs = self.get_response(base_url, 'provincetr')
        provinces = []  # flat list of {label, value} per province
        cityss = []     # per-province list of city dicts
        areas = []      # per-province list of per-city county lists
        for province_tr in province_trs:  # each table row of provinces
            for td in province_tr:  # each province cell in the row
                if td.a is None:
                    continue
                href_url = td.a.get('href')
                province_name = td.a.get_text()
                province_code = str(href_url.split(".")[0])
                province_url = base_url + href_url

                provinces.append({'label': province_name, 'value': province_code})

                citys = []
                countyss = []
                city_trs = self.get_response(province_url, None)
                for city_tr in city_trs[1:]:  # each city; [0] is the header row
                    city_tds = city_tr.find_all('td')
                    # Keep only the 3-digit prefix of the 12-digit area code.
                    city_code = city_tds[0].string[0:3]
                    city_name = city_tds[1].string

                    citys.append({'label': city_name, 'value': city_code})

                    countys = []
                    city_url = base_url + city_tds[1].a.get('href')
                    county_trs = self.get_response(city_url, None)
                    for county_tr in county_trs[1:]:  # each county; [0] is the header row
                        county_tds = county_tr.find_all('td')
                        # Keep only the 5-digit prefix of the 12-digit area code.
                        county_code = county_tds[0].string[0:5]
                        county_name = county_tds[1].string
                        countys.append({'label': county_name, 'value': county_code})
                    countyss.append(countys)

                cityss.append(citys)
                areas.append(countyss)

            time.sleep(1)  # throttle requests: be polite to the server

        # Write the aggregated data sources for all provinces.
        self.create_file('./province.txt', str(provinces))
        self.create_file('./city.txt', str(cityss))
        self.create_file('./area.txt', str(areas))


if __name__ == "__main__":
    # Run the scraper/exporter only when executed as a script (no-op on import).
    GetCitysToLocal()

国家统计局抓取的地图省市区划代码和城划分代码(最新2020/06/03),共596071条数据。来源于国家统计局http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居
以下是Python爬取国家统计局省市区详细数据的代码: ```python import requests import json # 请求头信息 headers = { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Origin': 'http://www.stats.gov.cn', 'Referer': 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36' } # 份列表请求URL province_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html' # 获取份列表 def get_province(): response = requests.get(province_url, headers=headers) response.encoding = 'gbk' province_list = response.text.split('<td><a href="')[1:] for province in province_list: province_code, province_name = province.split('.html">')[0], province.split('.html">')[1].split('</a></td>')[0] print(province_code, province_name) # 城市、区、街道请求URL city_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{}/{}/{}.html' # 获取城市、区、街道数据 def get_data(url): response = requests.get(url, headers=headers) response.encoding = 'gbk' data_list = response.text.split('<tr class="')[1:] for data in data_list: if 'countytr' in data: code, name = data.split('<td>')[0].split('href="')[1].split('.html')[0][-6:], data.split('<td>')[1].split('</td>')[0] print(code, name) elif 'towntr' in data: code, name = data.split('<td>')[0].split('href="')[1].split('.html')[0][-9:], data.split('<td>')[1].split('</td>')[0] print(code, name) elif 'villagetr' in data: code, name = data.split('<td>')[0].split('</a>')[0][-12:], data.split('<td>')[2].split('</td>')[0] print(code, name) # 爬取数据 def spider(): # 获取份列表 response = requests.get(province_url, headers=headers) response.encoding = 'gbk' province_list = response.text.split('<td><a href="')[1:] for province in province_list: province_code = province.split('.html">')[0] province_name = province.split('.html">')[1].split('</a></td>')[0] print(province_code, province_name) # 获取城市数据 
city_code = province_code[:2] city_url_now = city_url.format(city_code, province_code, city_code+province_code) get_data(city_url_now) # 获取区、街道数据 if city_code in {'11', '12', '31', '50'}: area_code = province_code[:6] area_url_now = city_url.format(city_code+province_code[:2], area_code, area_code) get_data(area_url_now) else: city_list_url = city_url.format(city_code+province_code[:2], city_code+province_code, city_code+province_code) response = requests.get(city_list_url, headers=headers) response.encoding = 'gbk' city_list = response.text.split('<td><a href="')[1:] for city in city_list: city_code_now = city.split('.html">')[0][-4:] city_url_now = city_url.format(city_code+province_code[:2], city_code_now, city_code+city_code_now) get_data(city_url_now) if __name__ == '__main__': spider() ``` 代码中首先定义了请求头信息,然后定义了份列表的请求URL,通过get_province()方法爬取份列表。 之后定义了城市、区、街道的请求URL格式,再通过get_data()方法爬取数据。 最后在spider()方法中,先爬取份列表,再根据份代码获取城市数据,接着判断是否需要获取区、街道数据,并获取相应数据
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值