Scraping Address Data from China's National Bureau of Statistics with Python


Many everyday applications rely on address data, which consists of an area name together with its administrative division code. The National Bureau of Statistics publishes this administrative division data on its website; the dataset contains nearly 700,000 records, so entering it by hand is clearly unrealistic, while handling it with a program is easy. The division data is organized into five levels: province, city/prefecture, district/county, township/town, and village.
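Each statistical division code is a 12-digit string whose segments map onto those five levels: 2 digits for the province, 2 for the city/prefecture, 2 for the district/county, 3 for the township/town, and 3 for the village. A minimal sketch of how a full code decomposes (the helper name and the sample code are illustrative, not part of the scraper):

def split_area_code(code):
    # Slice a 12-digit statistical division code into its five fixed-width segments
    return {
        "province": code[0:2],
        "city":     code[2:4],
        "county":   code[4:6],
        "town":     code[6:9],
        "village":  code[9:12],
    }

print(split_area_code("520102001001"))
# -> {'province': '52', 'city': '01', 'county': '02', 'town': '001', 'village': '001'}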

The program uses Python's BeautifulSoup module (bs4) to parse the HTML. Without further ado, the complete code is given below.

Note: the scraped data is saved as SQL insert statements. The National Bureau of Statistics page for statistical division codes and urban-rural classification codes (2020 edition) is: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html
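Before the full listing, here is a minimal sketch of the parsing idea: each level's page lists its entries as table rows (class citytr, countytr, towntr or villagetr) in which the first cell holds the 12-digit code and links to the next level's page, and the following cell holds the name; village-level rows carry the code, an urban-rural classification code, and the name in plain td cells with no links, which is why the script treats that level specially. The HTML snippet below is illustrative only:

from bs4 import BeautifulSoup

html = '''
<tr class="citytr">
  <td><a href="52/5201.html">520100000000</a></td>
  <td><a href="52/5201.html">贵阳市</a></td>
</tr>
'''
soup = BeautifulSoup(html, 'html.parser')
for row in soup.find_all('tr', 'citytr'):
    cells = row.find_all('a')
    # code, name, and the relative URL of the next level's page
    print(cells[0].get_text(), cells[1].get_text(), cells[0].get('href'))
# -> 520100000000 贵阳市 52/5201.html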

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
import sys
import os
import re
from bs4 import BeautifulSoup
import string

# Request headers
request_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.5",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "_trs_uv=jz3i785b_6_2zxi; AD_RS_COOKIE=20088745",
    "Host": "www.stats.gov.cn",
    "DNT": "1",
    "If-Modified-Since": "Thu, 10 Sep 2020 05:53:29 GMT",
    "If-None-Match": "1c98-580baa54b4840-gzip",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0"
}

# Counter of value rows in the current insert statement; a new insert is started when it reaches the limit below
sqlSaveIndex = 1
# When one insert statement has accumulated this many value rows, a new insert statement is started
sqlSaveIndexEnd = 10000
# Output file name
saveFileName = "data/areacode2020-all.sql"
# saveFileName = "data/areacode2020-simple.sql"

####function echo() start######
def echo( param,*args ):
    # Simple print wrapper; passing '' as an extra argument suppresses the trailing newline
    if len(args)==0:
        print(param)
    else:
        for var in args:
            if var=='':
                print(param,end='')
            else:
                print(param)
####function echo() end#######

def writeSql(sql):
    # Append one SQL fragment to the output file
    with open(saveFileName, "a+", encoding="utf-8") as fp:
        fp.write(sql)

############function: replaceLastChar  replace the trailing ',' with ';' #######################
def replaceLastChar():
    #sed -i 's/,\(\w*$\)/;\1/g' data/areacode.sql
    with open(saveFileName, 'r+', encoding="utf-8") as fo:
        filedata = fo.read()
    if filedata.strip() == '':
        echo("error: content is null")
        sys.exit(0)
    if filedata.strip().endswith(','):
        filedata = filedata.strip().rstrip(',') + ';'
    with open(saveFileName, "w+", encoding="utf-8") as fp:
        fp.write(filedata)

def echoinfo(name,code):
    print("areaName: %s,areaCode: %s" % (name,code))

def createTableMySQL():
    create_tb_cmd = '''
            CREATE TABLE IF NOT EXISTS areacode2020 (
            code  varchar(20) PRIMARY KEY NOT NULL COMMENT '地址code',
            area_name  varchar(255) DEFAULT '' COMMENT '名字',
            type  int COMMENT '级别,1:省,2:市/州,3区县,4乡镇,5村',
            parent_code varchar(20) COMMENT '父级code ',
            KEY `areacode_index` (`parent_code`)
            ) DEFAULT CHARSET=utf8 COMMENT='地址表2020';\n
    '''
    return create_tb_cmd

def createTablePgSQL():
    sql = '''
    CREATE TABLE if not exists public.areacode2020 (
        code varchar(20) NULL,
        area_name text NULL,
        "type" integer NULL,
        parent_code varchar(20) NULL,
        CONSTRAINT areacode2020_pk PRIMARY KEY (code)
    );
    CREATE INDEX areacode2020_parent_code_idx ON public.areacode2020 (parent_code);
    CREATE INDEX areacode2020_type_idx ON public.areacode2020 ("type");
    COMMENT ON TABLE public.areacode2020 IS '地址表2020';
    COMMENT ON COLUMN public.areacode2020.code IS '地址code';
    COMMENT ON COLUMN public.areacode2020.area_name IS '名字';
    COMMENT ON COLUMN public.areacode2020."type" IS '级别,1:省,2:市/州,3区县,4乡镇,5村';
    COMMENT ON COLUMN public.areacode2020.parent_code IS '父级code';
    '''
    return sql
def getItem(itemData, dataArray, parentRequestUrl, table, type):
    global sqlSaveIndex
    item = {}
    # Area name (village rows have an extra urban-rural classification column)
    if(type == 5):
        item['name'] = str(dataArray[2].get_text())
    else:
        item['name'] = str(dataArray[1].get_text())
    # URL of the next level's page
    href = re.findall('(.*)/', parentRequestUrl)
    if type != 5:
        item['url'] = href[0] + "/" + dataArray[0].get('href')
    # Parent code
    item['parentCode'] = itemData.get('code')
    # Level type
    item['type'] = type
    # Division code (first 12 digits)
    item['code'] = str(dataArray[0].get_text())[0:12]
    # if type == 4:
    #     print(item.get('url'))
    # Print the generated sql statement (debug)
    #print('insert into areacodeinfo(area,code,type,parent_code) values (%s,%s,%s,%s)' % (item['name'], item['code'], item['type'], item['parentCode']) + ";")
    echoinfo(item['name'], item['code'])
    if sqlSaveIndex == 1:
        writeSql("insert into areacode2020(area_name,code,type,parent_code) values ('%s','%s',%s,'%s')" % (item['name'], item['code'], item['type'], item['parentCode']) + ",")
    elif sqlSaveIndex == sqlSaveIndexEnd:
        writeSql("('%s','%s',%s,'%s')" % (item['name'], item['code'], item['type'], item['parentCode']) + ";\n")
        sqlSaveIndex = 0
    else:
        writeSql("('%s','%s',%s,'%s')" % (item['name'], item['code'], item['type'], item['parentCode']) + ",")
    sqlSaveIndex +=1
    return item

# Fetch a page and return a BeautifulSoup object
def getSoup(requestUrl):
    htmls = requests.get(requestUrl, headers=request_headers)
    # These pages are not UTF-8; force GBK decoding
    htmls.encoding = 'GBK'
    #soup = BeautifulSoup(htmls.text, 'html.parser', from_encoding='UTF-8')
    #echo(htmls.text)  # debug: dump the raw page
    soup = BeautifulSoup(htmls.text, 'html.parser')
    return soup

# Iterate over one level's table rows and collect the items
def forItem(soup, label, labelClass, labelChild, item, requestUrl, type, tableName, lists):
    for link in soup.find_all(label, labelClass):
        array = link.find_all(labelChild, class_='')
        if not len(array):
            continue
        itemData = getItem(item, array, requestUrl, tableName, type)
        lists.append(itemData)


# Province list
def getProvince(provinceList,proviceUrl):
    soup = getSoup(proviceUrl)
    for link in soup.find_all('a', class_=''):
    #for link in soup.find_all(href=re.compile('^52.html')):
        requestCityUrl = re.findall('(.*)/', proviceUrl)
        item = {}
        # Name
        item['name'] = str(link.get_text())
        # URL of the next level's page
        href = str(link.get('href'))
        item['url'] = requestCityUrl[0] + "/" + href
        # Parent code
        item['parentCode'] = '0'
        # Level type
        item['type'] = 1
        # Division code
        #item['code'] = (href.split('.'))[0] + '0000000000'
        item['code'] = (href.split('.'))[0]
        provinceList.append(item)
        # Write the insert statement
        # print('====>',types)
        writeSql("insert into areacode2020(area_name,code,type,parent_code) values ('%s','%s',%s,'%s')" % ((item['name']), item['code'], item['type'], item['parentCode']) + ";\n")
        echoinfo(item['name'],item['code'])
    return provinceList

# City/prefecture list
def getCityList(provinceList,cityList):
    for item in provinceList:
        cityRequestUrl = str(item.get('url'))
        soup = getSoup(item.get('url'))
        forItem(soup, 'tr', 'citytr', 'a', item, cityRequestUrl, 2, 'city', cityList)
    return cityList
# District/county list
def getCountyList(cityList,countyList):
    for item in cityList:
        countyRequestUrl = str(item.get('url'))
        soup = getSoup(item.get('url'))
        forItem(soup, 'tr', 'countytr', 'a', item, countyRequestUrl, 3, 'county', countyList)
    return countyList
# Township/town list
def getTownList(countyList,townList):
    for item in countyList:
        townRequestUrl = str(item.get('url'))
        soup = getSoup(item.get('url'))
        forItem(soup, 'tr', 'towntr', 'a', item, townRequestUrl, 4, 'town', townList)
    return townList
# Village list
def getVillageList(townList,villageList):
    for item in townList:
        villageRequestUrl = str(item.get('url'))
        soup = getSoup(item.get('url'))
        forItem(soup, 'tr', 'villagetr', 'td', item,villageRequestUrl, 5, 'village', villageList)
    return villageList


def main():
    proviceUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    if not os.path.exists('data'):
        os.mkdir('data')
    provinceList = []
    cityList = []
    countyList = []
    townList = []
    villageList = []
    provinceList = getProvince(provinceList,proviceUrl)
    cityList = getCityList(provinceList,cityList)
    countyList = getCountyList(cityList,countyList)
    townList = getTownList(countyList,townList)
    getVillageList(townList,villageList)
    # Change the trailing , into ;
    replaceLastChar()

if __name__ == "__main__":
    main()
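The script only writes insert statements; createTableMySQL() and createTablePgSQL() are defined but never written into the output file, so the target table has to be created before data/areacode2020-all.sql is imported (for example with the mysql command-line client). Below is a minimal loading sketch for MySQL, assuming it is appended to the script above, that pymysql is installed, and that a local database named areadb already exists; the host, user and password are placeholders:

import pymysql

conn = pymysql.connect(host="127.0.0.1", user="root", password="secret",
                       database="areadb", charset="utf8mb4")
try:
    with conn.cursor() as cur:
        # Create the table first; strip the trailing ';' so a single clean statement is sent
        cur.execute(createTableMySQL().strip().rstrip(';'))
        with open(saveFileName, encoding="utf-8") as f:
            # A naive split on ';' is fine here because area names contain no semicolons
            for statement in f.read().split(";"):
                if statement.strip():
                    cur.execute(statement)
    conn.commit()
finally:
    conn.close()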

For details (the latest code and address data), see: https://github.com/hlinfocc/areacode

If this is helpful to you, please give the repo a Star.
