全国统计用区划代码和城乡划分代码

1.爬取过程需要增加代理ip,否则会出现验证码的情况。

2.当前没有增加失败次数的限制;线上用单进程跑了一次,一晚上就跑完了,大概61万条数据。

3.cookie都是没有增加的,当一个新的ip访问的时候,基本上是不需要出验证码的。

4.在程序开始的时候增加了一个pass_list的列表,是从原表中查到哪些乡镇是爬取过的,之后不再爬取,以此来增加断点续爬的效率。

5.断点续爬也可以再增加省份、城市判断,但是需要将列表的最后一个删掉,因为可能是爬取某个城市的过程中出现了错误没有爬完。

6.页面可能会出现转码的问题:发现 gbk、utf-8、gb2312 都不行,后来用了 ISO-8859-1 才可以。

# -*-coding:utf-8 -*-
import time
import requests


from lxml import etree
from conf.dbr import random_ip
from conf.conn import Pymsql_conn




# Browser-like request headers sent with every scrape request.
# NOTE(review): no Cookie header is set on purpose — per the author's notes,
# a fresh proxy IP normally gets no captcha without one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    # "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Host": "www.stats.gov.cn",
}




# Parse the provinces and their URLs from the main index page.
def parse_province(main_url):
    """Parse the index page into a list of ``(province, province_url)`` tuples.

    Retries forever with a fresh proxy IP until a non-empty province list
    is parsed out of the page.

    :param main_url: URL of the index page that lists all provinces.
    :return: list of ``(province_name, province_url)`` tuples.
    """
    while 1:
        # BUGFIX: reset each attempt — previously a stale response from an
        # earlier iteration stayed truthy and the same bad HTML was
        # re-parsed in an endless loop after any parse failure.
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=main_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response is not None and response.ok:
                # Decode the raw bytes directly; the old
                # text -> encode(response.encoding) -> decode round trip
                # raises when response.encoding is None.
                try:
                    province_html = response.content.decode('gb2312')
                except UnicodeDecodeError:
                    # latin-1 never fails; keeps the loop alive on odd pages
                    province_html = response.content.decode('ISO-8859-1')
                tree = etree.HTML(province_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="provincetr"]/td/a')
                    province_list = []
                    for link in result_list:
                        province = link.xpath('./text()')[0]
                        province_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + link.xpath('./@href')[0]
                        print(province, province_url)
                        province_list.append((province, province_url))
                    # Empty list means the page was a captcha/error page: retry.
                    if province_list:
                        return province_list
                except Exception as e:
                    print(province_html)
                    print(e, '省份xpath解析失败')
        else:
            print('无ip睡眠3秒-province')
            time.sleep(3)




# Given a province and its URL, return the cities listed on the province page.
def parse_city(province, province_url):
    """Parse a province page into a list of city dicts.

    Retries forever with a fresh proxy IP until at least one city is parsed.

    :param province: province name the page belongs to.
    :param province_url: URL of the province page.
    :return: list of dicts with keys ``province`` / ``city_code`` / ``city`` /
        ``city_url``.
    """
    while 1:
        # BUGFIX: reset each attempt so a stale response whose parse already
        # failed is not re-parsed in an endless loop.
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=province_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response is not None and response.ok:
                # Decode the raw bytes directly (the old re-encode round trip
                # raises when response.encoding is None). Fallback order kept
                # from the original: utf-8, gbk, then latin-1 (never fails).
                try:
                    city_html = response.content.decode('utf-8')
                except UnicodeDecodeError:
                    try:
                        city_html = response.content.decode('gbk')
                    except UnicodeDecodeError:
                        city_html = response.content.decode('ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="citytr"]')
                    city_list = []
                    for row in result_list:
                        city_code = row.xpath('./td[1]/a/text()')[0]
                        city = row.xpath('./td[2]/a/text()')[0]
                        city_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + row.xpath('./td[2]/a/@href')[0]
                        print(province, city_code, city, city_url)
                        city_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "city_url": city_url,
                        })
                    # Empty list means a captcha/error page: retry with new IP.
                    if city_list:
                        return city_list
                except Exception as e:
                    print(city_html)
                    print(e, '城市xpath解析失败')
        else:
            print('无ip睡眠3秒-city')
            time.sleep(3)




# Resolve the districts/counties of one city.
def parse_district(city_dict):  # county level
    """Parse a city page into a list of district dicts.

    :param city_dict: dict produced by :func:`parse_city` with keys
        ``province`` / ``city_code`` / ``city`` / ``city_url``.
    :return: list of dicts extended with ``district_code`` / ``district`` /
        ``district_url``. May be empty (rows without a link are skipped).
    """
    province = city_dict.get('province')
    city_code = city_dict.get('city_code')
    city = city_dict.get('city')
    city_url = city_dict.get('city_url')
    # headers['Cookie'] = ''
    while 1:
        # BUGFIX: reset each attempt — a stale truthy response used to be
        # re-parsed forever once a parse failed.
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=city_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, '城市信息请求失败')
            if response is not None and response.ok:
                # Decode raw bytes directly; fallback order kept from the
                # original: gbk, utf-8, then latin-1 (never fails).
                try:
                    city_html = response.content.decode('gbk')
                except UnicodeDecodeError:
                    try:
                        city_html = response.content.decode('utf-8')
                    except UnicodeDecodeError:
                        city_html = response.content.decode('ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="countytr"]')
                    district_list = []
                    for row in result_list:
                        district_code = row.xpath('./td[1]/a/text()')
                        # Some rows (e.g. "市辖区") carry no link — skip them.
                        if district_code:
                            district_code = district_code[0]
                            district = row.xpath('./td[2]/a/text()')[0]
                            href = row.xpath('./td[2]/a/@href')[0]
                            # first 2 digits of the filename are the path segment
                            num = href.split('/')[-1][0:2]
                            district_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num}/' + href
                            print(province, city_code, city, district_code, district, district_url)
                            district_list.append({
                                "province": province,
                                "city_code": city_code,
                                "city": city,
                                "district_code": district_code,
                                "district": district,
                                "district_url": district_url,
                            })
                    # NOTE: unlike parse_city, an empty list is returned as-is
                    # (original behaviour preserved).
                    return district_list
                except Exception as e:
                    print(city_html)
                    print(e, '区县解析失败')
        else:
            print('无ip睡眠3秒-district')
            time.sleep(3)




# Resolve the towns/townships of one district.
def parse_town(district_dict):
    """Parse a district page into a list of town dicts.

    :param district_dict: dict produced by :func:`parse_district`.
    :return: list of dicts extended with ``town_code`` / ``town`` /
        ``town_url``. May be empty (original behaviour preserved).
    """
    province = district_dict.get('province')
    city_code = district_dict.get('city_code')
    city = district_dict.get('city')
    district_code = district_dict.get('district_code')
    district = district_dict.get('district')
    district_url = district_dict.get('district_url')
    # headers['Cookie'] = ''
    while 1:
        # BUGFIX: reset each attempt — a stale truthy response used to be
        # re-parsed forever once a parse failed.
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=district_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, '乡镇信息请求失败')
            if response is not None and response.ok:
                # Decode raw bytes directly; fallback order kept from the
                # original: gbk, utf-8, then latin-1 (never fails).
                try:
                    district_html = response.content.decode('gbk')
                except UnicodeDecodeError:
                    try:
                        district_html = response.content.decode('utf-8')
                    except UnicodeDecodeError:
                        district_html = response.content.decode('ISO-8859-1')
                tree = etree.HTML(district_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="towntr"]')
                    town_list = []
                    for row in result_list:
                        town_code = row.xpath('./td[1]/a/text()')[0]
                        town = row.xpath('./td[2]/a/text()')[0]
                        href = row.xpath('./td[2]/a/@href')[0]
                        # digits 1-2 and 3-4 of the filename are path segments
                        num1 = href.split('/')[-1][0:2]
                        num2 = href.split('/')[-1][2:4]
                        town_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num1}/{num2}/' + href
                        print(town_code, town, town_url)
                        town_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "town_url": town_url,
                        })
                    return town_list
                except Exception as e:
                    print(district_html)
                    print(e, '乡镇解析失败')
        else:
            print('无ip睡眠3秒-town')
            time.sleep(3)




# Resolve the village/neighbourhood committees of one town.
def parse_village(town_dict):
    """Parse a town page into a list of village dicts (the leaf level).

    :param town_dict: dict produced by :func:`parse_town`.
    :return: list of dicts extended with ``village_code`` / ``village`` /
        ``town_country_code`` (the 3-digit urban/rural classification code).
    """
    province = town_dict.get('province')
    city_code = town_dict.get('city_code')
    city = town_dict.get('city')
    district_code = town_dict.get('district_code')
    district = town_dict.get('district')
    town_code = town_dict.get('town_code')
    town = town_dict.get('town')
    town_url = town_dict.get('town_url')
    # headers['Cookie'] = ''
    while 1:
        # BUGFIX: reset each attempt — a stale truthy response used to be
        # re-parsed forever once a parse failed.
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=town_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, '村委会居委会信息请求失败')
            if response is not None and response.ok:
                # Decode raw bytes directly; fallback order kept from the
                # original: gbk, utf-8, then latin-1 (never fails).
                try:
                    village_html = response.content.decode('gbk')
                except UnicodeDecodeError:
                    try:
                        village_html = response.content.decode('utf-8')
                    except UnicodeDecodeError:
                        village_html = response.content.decode('ISO-8859-1')
                tree = etree.HTML(village_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="villagetr"]')
                    village_list = []
                    for row in result_list:
                        # village rows have plain <td> cells, no links
                        village_code = row.xpath('./td[1]/text()')[0]
                        town_country_code = row.xpath('./td[2]/text()')[0]
                        village = row.xpath('./td[3]/text()')[0]
                        print(village_code, town_country_code, village)
                        village_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "village_code": village_code,
                            "village": village,
                            "town_country_code": town_country_code,
                        })
                    return village_list
                except Exception as e:
                    print(village_html)
                    print(e, '村委会居委会解析失败')
        else:
            print('无ip睡眠3秒-village')
            time.sleep(3)




# Insert the crawled village records into MySQL.
def sql_in(task_list):
    """Insert every record of ``task_list`` into ``national_statistics_code``.

    :param task_list: list of dicts as produced by :func:`parse_village`.

    SECURITY NOTE(review): the SQL is built by interpolating the Python tuple
    repr into an f-string — a value containing a single quote breaks the
    statement, and None becomes the literal string 'None'. This should use
    parameterized queries (e.g. cursor.execute(sql, val)); confirm whether
    Pymsql_conn.insert_info supports passing parameters separately.
    """
    pm = Pymsql_conn()
    for task in task_list:
        province = task.get('province')
        city_code = task.get('city_code')
        city = task.get('city')
        district_code = task.get('district_code')
        district = task.get('district')
        town_code = task.get('town_code')
        town = task.get('town')
        village_code = task.get('village_code')
        village = task.get('village')
        town_country_code = task.get('town_country_code')
        val = (province,city_code,city,district_code,district,town_code,town,village_code,village,town_country_code)
        # print(val)
        insert_sql = f'insert into national_statistics_code(province,city_code,city,district_code,district,town_code,town,village_code,village,town_country_code) values {val}'
        try:
            pm.insert_info(sql=insert_sql)
        except Exception as e:
            # failed rows are logged and skipped; the batch continues
            print(e, insert_sql)
    pm.commit_info()
    pm.mysql_close()




# Program entry point: crawl the full hierarchy and store village rows.
def server_run(skip_towns=None):
    """Crawl province -> city -> district -> town -> village and insert rows.

    :param skip_towns: optional iterable of town names that are already in
        the database (resume support). When None, falls back to the
        module-level ``pass_list`` built in ``__main__`` — this preserves
        the original behaviour, but means calling with no argument from an
        import (without ``pass_list`` defined) raises NameError.
    """
    if skip_towns is None:
        skip_towns = pass_list  # built in __main__ from previously-crawled towns
    skip_towns = set(skip_towns)  # O(1) membership tests instead of O(n) list scans
    main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    for province, province_url in parse_province(main_url):
        # Hard-coded skip list: provinces that were already fully crawled.
        if province in ('北京市', '天津市'):
            continue
        city_list = parse_city(province, province_url)
        print(province, 'province')
        for city in city_list:
            print(city.get('city'), 'city')
            # Hard-coded skip: this city was already fully crawled.
            if city.get('city') == '石家庄市':
                continue
            for district in parse_district(city):
                for town in parse_town(district):
                    print(province, city.get('city'), town.get('town'))
                    if town.get('town') not in skip_towns:
                        vil_list = parse_village(town)
                        sql_in(vil_list)  # persist to MySQL




if __name__ == '__main__':
    # Build the resume list: towns already present in the DB are skipped,
    # so a restarted crawl continues roughly where it left off.
    pms = Pymsql_conn()
    pass_list = pms.check_info(sql='SELECT distinct(town) from national_statistics_code;')
    pass_list = [i.get('town') for i in pass_list]  # town names already crawled — do not crawl them again
    print(pass_list)
    pms.mysql_close()
    server_run()
    # --- ad-hoc debugging snippets kept by the author ---
    # main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    # parse_province(main_url)
    # province = town_dict.get('province')
    # city_code = town_dict.get('city_code')
    # city = town_dict.get('city')
    # district_code = town_dict.get('district_code')
    # district = town_dict.get('district')
    # town_code = town_dict.get('town_code')
    # town = town_dict.get('town')
    # town_url = town_dict.get('town_url')
    # town_dict = {
    #     "province":"河北省",
    #     "city_code":"130229000000",
    #     "city":"唐山市",
    #     "district_code":"130229201000",
    #     "district":"玉田县",
    #     "town_code":"130229203000",
    #     "town":"潮洛窝乡",
    #     "town_url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/13/02/29/130229203.html",
    # }
    # parse_village(town_dict)

统计区划代码城乡划分代码发布说明: 一、编制依据 2008年7月,国务院批复同意国家统计局与民政部、住建部、公安部、财政部、国土部、农业部共同制定的《关于统计划分城乡的规定》(国函〔2008〕60号),自2008年8月1日实施,正式奠定了统计划分城乡的理论依据方法基础。随后,国家统计局印发《统计区划代码城乡划分代码编制规则》(国统字〔2009〕91号)。 二、区划范围 统计区划代码城乡划分代码区划范围,是国家统计局开展统计调查的区划范围。未包括我国台湾省、香港特别行政区、澳门特别行政区。 三、发布内容 12位统计区划代码3位城乡分类代码。 四、适用领域 《国务院关于统计划分城乡规定的批复》(国函〔2008〕60号)明确指出:“本规定作为统计划分城乡的依据,不改变现有的行政区划、隶属关系、管理权限机构编制,以及土地规划、城乡规划等有关规定”。各级各部门在使用统计区划代码城乡划分代码时,请务必结合实际情况。 五、几个具体问题的说明 (一)补充编制开发区统计汇总识别码情况。为满足统计调查工作组织数据汇总的需要,国家统计局对一些符合条件的开发区编制了统计汇总识别码。统计汇总识别码在统计区划代码的县级码段上编制,其码段为71~80。 (二)关于河北省沧州市任丘市的苟各庄镇、鄚州镇、七间房乡、保定市高阳县的龙化乡统计区划代码临时调整情况的说明。按照河北省委、省政府关于对雄安新区周边部分区域实施托管的通知要求,沧州市任丘市的苟各庄镇、鄚州镇、七间房乡划归雄县实施统计上托管,保定市高阳县的龙化乡划归安新县实施统计上托管。为确保统计调查工作的顺利开展, 国家统计局对苟各庄镇、鄚州镇、七间房乡、龙化乡的统计用十二位区划代码进行了临时调整,具体调整为:鄚州镇代码由130982104000变更为130638106000;苟各庄镇代码由130982105000变更为130638107000;七间房乡代码由130982206000变更为130638205000;龙化乡代码由130628204000变更为130632203000。上述变更后的统计区划代码为临时代码,待民政部门对雄安新区上述4个乡镇区划调整确认后,再将临时代码变更为正式统计区划代码。 (三)关于黑龙江省大兴安岭地区县级单位统计区划代码调整情况说明。民政部民函〔2018〕50号文件撤销黑龙江省大兴安岭地区漠河县(六位区划代码为232723),设立漠河市(六位区划代码为232701)。为执行国家标准,保证统计部门与民政部门名称相同的县级单位六位区划代码的一致性,国家统计局根据《统计区划代码城乡划分代码编制规则》(国统字〔2009〕91号),调整黑龙江省大兴安岭地区所辖的加格达奇区、松岭区、新林区呼中区的六位统计区划代码,具体调整为:加格达奇区代码由232701变更为232761;松岭区代码由232702变更为232762;新林区代码由232703变更为232763;呼中区代码由232704变更为232764。 (四)此版本区划代码与第四次全国经济普查区划代码的相关说明。此版本区划代码是调查截止日期为2018年10月31日的统计区划代码。由于第四次全国经济普查清查工作于2018年8月开始,四经普的清查登记工作中采用2018年6月15日的统计区划代码。第四次全国经济普查数据处理使用2018年10月31日的统计区划代码
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值