1. The crawl needs proxy IPs; without them the site starts serving CAPTCHA pages.
2. There is currently no cap on the number of failed attempts; in practice a single-process run finished overnight, about 610,000 rows in total.
3. No cookies are set at all; when a request comes from a fresh IP, a CAPTCHA is basically never triggered.
4. At startup the program builds a pass_list by querying the existing table for towns that have already been crawled; those towns are skipped, which makes resuming after an interruption much more efficient.
5. The resume logic could also check at the province and city level, but then the last entry of the list must be dropped first, because the run may have failed partway through that city and left it incomplete.
6. The pages have encoding problems: gbk, utf-8 and gb2312 all failed on some of them, and only ISO-8859-1 worked in the end; see the short sketch right after this list.
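The workaround in point 6 amounts to recovering the raw response bytes and letting lxml sniff the real charset from the page's own meta tag. A minimal standalone sketch of the idea (no proxies here; the try order mirrors the full script below):

import requests
from lxml import etree

resp = requests.get('http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html')
try:
    # requests often guesses the wrong encoding for these pages, so first try
    # round-tripping through the guessed encoding into a Chinese codec
    html = resp.text.encode(resp.encoding).decode('gb2312')
except Exception:
    # fall back to the raw bytes: re-encoding the mis-decoded text as
    # ISO-8859-1 is lossless, and etree.HTML() accepts bytes and picks up
    # the real charset from the page's meta tag itself
    html = resp.text.encode('ISO-8859-1')
tree = etree.HTML(html)

The full script follows.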
# -*- coding: utf-8 -*-
import time

import requests
from lxml import etree

from conf.dbr import random_ip
from conf.conn import Pymsql_conn

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Upgrade-Insecure-Requests": "1",
    # "Referer": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Host": "www.stats.gov.cn",
}
# Parse the provinces and their URLs from the index page and return them as a list.
def parse_province(main_url):
    """
    Parse the province names and their URLs from the index page.
    :param main_url: index page URL
    :return: list of (province, province_url) tuples
    """
    while 1:
        response = None  # reset each attempt so a failed request isn't re-parsed
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=main_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response:
                # province_html = response.text.encode(encoding=response.encoding).decode('gbk')
                province_html = response.text.encode(encoding=response.encoding).decode('gb2312')
                tree = etree.HTML(province_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="provincetr"]/td/a')
                    # print(result_list)
                    province_list = []
                    for i in result_list:
                        province = i.xpath('./text()')[0]
                        province_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + i.xpath('./@href')[0]
                        print(province, province_url)
                        province_list.append((province, province_url))
                    # print(province_list)
                    if province_list:
                        return province_list
                except Exception as e:
                    print(province_html)
                    print(e, 'province xpath parsing failed')
        else:
            print('no proxy available, sleeping 3s - province')
            time.sleep(3)
# Take a province and its URL, and return the cities listed on the province page.
def parse_city(province, province_url):
    """
    Parse the city rows on a province page.
    :param province: province name
    :param province_url: province page URL
    :return: list of city dicts
    """
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=province_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e)
            if response:
                try:
                    city_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                except Exception as e:
                    print(e)
                    try:
                        city_html = response.text.encode(encoding=response.encoding).decode('gbk')
                    except Exception as e:
                        print(e)
                        # last resort: raw bytes, let lxml sniff the charset
                        city_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="citytr"]')
                    city_list = []
                    for i in result_list:
                        city_code = i.xpath('./td[1]/a/text()')[0]
                        city = i.xpath('./td[2]/a/text()')[0]
                        city_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/' + i.xpath('./td[2]/a/@href')[0]
                        print(province, city_code, city, city_url)
                        city_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "city_url": city_url,
                        })
                    # print(city_list)
                    if city_list:
                        return city_list
                except Exception as e:
                    print(city_html)
                    print(e, 'city xpath parsing failed')
        else:
            print('no proxy available, sleeping 3s - city')
            time.sleep(3)
# Parse the districts/counties on a city page.
def parse_district(city_dict):
    """
    Parse the district/county rows from the city page in city_dict.
    :param city_dict: dict produced by parse_city
    :return: list of district dicts
    """
    province = city_dict.get('province')
    city_code = city_dict.get('city_code')
    city = city_dict.get('city')
    city_url = city_dict.get('city_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=city_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'city page request failed')
            if response:
                try:
                    city_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        city_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        city_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(city_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="countytr"]')
                    district_list = []
                    for i in result_list:
                        district_code = i.xpath('./td[1]/a/text()')
                        # rows whose first cell has no link (e.g. municipal districts) are skipped
                        if district_code:
                            district_code = district_code[0]
                            district = i.xpath('./td[2]/a/text()')[0]
                            href = i.xpath('./td[2]/a/@href')[0]
                            num = href.split('/')[-1][0:2]
                            district_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num}/' + href
                            print(province, city_code, city, district_code, district, district_url)
                            district_list.append({
                                "province": province,
                                "city_code": city_code,
                                "city": city,
                                "district_code": district_code,
                                "district": district,
                                "district_url": district_url,
                            })
                    # print(district_list)
                    return district_list
                except Exception as e:
                    print(city_html)
                    print(e, 'district parsing failed')
        else:
            print('no proxy available, sleeping 3s - district')
            time.sleep(3)
# Parse the towns on a district page.
def parse_town(district_dict):
    """
    Parse the town rows from the district page in district_dict.
    :param district_dict: dict produced by parse_district
    :return: list of town dicts
    """
    province = district_dict.get('province')
    city_code = district_dict.get('city_code')
    city = district_dict.get('city')
    district_code = district_dict.get('district_code')
    district = district_dict.get('district')
    district_url = district_dict.get('district_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=district_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'town page request failed')
            if response:
                try:
                    district_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        district_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        district_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(district_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="towntr"]')
                    town_list = []
                    for i in result_list:
                        town_code = i.xpath('./td[1]/a/text()')[0]
                        town = i.xpath('./td[2]/a/text()')[0]
                        href = i.xpath('./td[2]/a/@href')[0]
                        num1 = href.split('/')[-1][0:2]
                        num2 = href.split('/')[-1][2:4]
                        town_url = f'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/{num1}/{num2}/' + href
                        print(town_code, town, town_url)
                        town_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "town_url": town_url,
                        })
                    # print(town_list)
                    return town_list
                except Exception as e:
                    print(district_html)
                    print(e, 'town parsing failed')
        else:
            print('no proxy available, sleeping 3s - town')
            time.sleep(3)
# Parse the village/neighborhood committees on a town page.
def parse_village(town_dict):
    """
    Parse the village/neighborhood committee rows from the town page in town_dict.
    :param town_dict: dict produced by parse_town
    :return: list of village dicts
    """
    province = town_dict.get('province')
    city_code = town_dict.get('city_code')
    city = town_dict.get('city')
    district_code = town_dict.get('district_code')
    district = town_dict.get('district')
    town_code = town_dict.get('town_code')
    town = town_dict.get('town')
    town_url = town_dict.get('town_url')
    # headers['Cookie'] = ''
    while 1:
        response = None
        ip = random_ip()
        if ip:
            proxy = {
                "https": ip,
                "http": ip,
            }
            try:
                response = requests.get(url=town_url, headers=headers, proxies=proxy, timeout=1)
            except Exception as e:
                print(e, 'village page request failed')
            # print(response.text)
            if response:
                try:
                    village_html = response.text.encode(encoding=response.encoding).decode('gbk')
                except Exception as e:
                    print(e)
                    try:
                        village_html = response.text.encode(encoding=response.encoding).decode('utf-8')
                    except Exception as e:
                        print(e)
                        # village_html = response.text.encode(encoding=response.encoding).decode('ISO-8859-1').encode('ISO-8859-1')
                        village_html = response.text.encode(encoding='ISO-8859-1')
                tree = etree.HTML(village_html)
                try:
                    result_list = tree.xpath('/html/body//tr[@class="villagetr"]')
                    village_list = []
                    for i in result_list:
                        village_code = i.xpath('./td[1]/text()')[0]
                        town_country_code = i.xpath('./td[2]/text()')[0]
                        village = i.xpath('./td[3]/text()')[0]
                        print(village_code, town_country_code, village)
                        village_list.append({
                            "province": province,
                            "city_code": city_code,
                            "city": city,
                            "district_code": district_code,
                            "district": district,
                            "town_code": town_code,
                            "town": town,
                            "village_code": village_code,
                            "village": village,
                            "town_country_code": town_country_code,
                        })
                    # print(village_list)
                    return village_list
                except Exception as e:
                    print(village_html)
                    print(e, 'village parsing failed')
        else:
            print('no proxy available, sleeping 3s - village')
            time.sleep(3)
# Write the scraped rows into MySQL.
def sql_in(task_list):
    pm = Pymsql_conn()
    for task in task_list:
        province = task.get('province')
        city_code = task.get('city_code')
        city = task.get('city')
        district_code = task.get('district_code')
        district = task.get('district')
        town_code = task.get('town_code')
        town = task.get('town')
        village_code = task.get('village_code')
        village = task.get('village')
        town_country_code = task.get('town_country_code')
        val = (province, city_code, city, district_code, district, town_code, town, village_code, village, town_country_code)
        # print(val)
        insert_sql = f'insert into national_statistics_code(province,city_code,city,district_code,district,town_code,town,village_code,village,town_country_code) values {val}'
        try:
            pm.insert_info(sql=insert_sql)
        except Exception as e:
            print(e, insert_sql)
    pm.commit_info()
    pm.mysql_close()
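# Note: interpolating the tuple's repr into the INSERT statement only works
# while no value contains a quote character; a parameterized query (e.g.
# pymysql's cursor.execute(sql, val) with %s placeholders) would be safer,
# assuming Pymsql_conn exposes the underlying cursor -- that wrapper's API
# isn't shown here.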
# Program entry point.
def server_run():
    main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
    province_list = parse_province(main_url)
    for i in province_list:
        province, province_url = i[0], i[1]
        if province not in ['北京市', '天津市']:  # skip provinces that have already been crawled
            city_list = parse_city(province, province_url)
            print(province, 'province')
            for city in city_list:
                print(city.get('city'), 'city')
                if city.get('city') != '石家庄市':  # likewise, skip a city that is already done
                    district_list = parse_district(city)
                    for district in district_list:
                        town_list = parse_town(district)
                        for town in town_list:
                            print(province, city.get('city'), town.get('town'))
                            if town.get('town') not in pass_list:
                                vil_list = parse_village(town)
                                sql_in(vil_list)  # write to MySQL


if __name__ == '__main__':
    pms = Pymsql_conn()
    pass_list = pms.check_info(sql='SELECT distinct(town) from national_statistics_code;')
    # towns already in the table are skipped on resume; per note 5, a
    # province/city-level resume should drop the last entry, since that
    # one may only be partially crawled
    pass_list = [i.get('town') for i in pass_list]
    print(pass_list)
    pms.mysql_close()
    server_run()
# main_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html'
# parse_province(main_url)
# town_dict = {
# "province":"河北省",
# "city_code":"130229000000",
# "city":"唐山市",
# "district_code":"130229201000",
# "district":"玉田县",
# "town_code":"130229203000",
# "town":"潮洛窝乡",
# "town_url":"http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/13/02/29/130229203.html",
# }
# parse_village(town_dict)
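Each parse_* function above repeats the same fetch-through-a-random-proxy loop with no limit on failures (point 2 of the notes). A sketch of how the shared pattern could be factored out with a retry cap, reusing the script's random_ip and headers; fetch_html and max_retries are illustrative names, not part of the original script:

def fetch_html(url, max_retries=20):
    """Fetch a URL through random proxies, giving up after max_retries attempts."""
    for attempt in range(max_retries):
        ip = random_ip()
        if not ip:
            print('no proxy available, sleeping 3s')
            time.sleep(3)
            continue
        try:
            response = requests.get(url=url, headers=headers,
                                    proxies={"http": ip, "https": ip}, timeout=1)
            if response:  # requests treats non-2xx responses as falsy
                return response
        except Exception as e:
            print(e, f'attempt {attempt + 1}/{max_retries} failed')
    return None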