xpath爬虫-抓取全国行政区划和城乡区划数据

数据来源地址:2020年度全国行政区划和城乡划分代码

代码示例:以安徽省合肥市为例

import requests
from lxml import etree
import pandas as pd


def get_html(url, timeout=10):
    """Download a page and return its body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl indefinitely.

    Returns:
        The decoded page text, or ``None`` when the request raises, the
        status code is not 200, or the body cannot be decoded. A diagnostic
        message is printed in every failure case.
    """
    # Placeholder value: replace with a real browser User-Agent string.
    header = {'user-agent': '你自己的浏览器信息'}
    try:
        response = requests.get(url, headers=header, timeout=timeout)
        # Only a 200 response is treated as a usable page.
        if response.status_code != 200:
            print("{0}网页请求状态码错误!{0}".format("-" * 10))
            return None
        return response.content.decode('gbk')
    except Exception as e:
        # Deliberately broad: this helper is best-effort and reports any
        # request or decoding failure instead of aborting the whole crawl.
        print("{0}请求参数出现错误:{1}{0}".format("-" * 10, e))
        return None


def parse_url(url, xpath_path):
    """Extract (name, absolute link) pairs for the next level of a page.

    Args:
        url: Page to download and parse.
        xpath_path: XPath selecting the ``<a>`` elements whose text is the
            area name and whose ``href`` points to the next-level page.

    Returns:
        A list of ``(name, next_url)`` tuples. The list is empty when the
        page could not be downloaded or parsed.
    """
    from urllib.parse import urljoin

    html = get_html(url)
    # get_html returns None on failure; there is nothing to parse then.
    if not html:
        return []

    tree = etree.HTML(html)
    # NOTE(review): some lxml versions appear to return None for blank
    # documents - guard so the xpath call below cannot fail on it.
    if tree is None:
        return []

    pairs = []
    # Read name and href from the SAME element so a missing text or href on
    # one anchor cannot shift the pairing of all the following rows.
    for node in tree.xpath(xpath_path):
        name = node.text
        href = node.get("href")
        if name and href:
            # urljoin resolves the href against the current page, which also
            # copes with absolute or root-relative links.
            pairs.append((name, urljoin(url, href)))
    return pairs


def parse_url2(url, xpath_path):
    """Extract the text values of the last level, which has no further links.

    Args:
        url: Page to download and parse.
        xpath_path: XPath selecting the elements whose text should be read.

    Returns:
        A list of the text nodes found under ``xpath_path``. The list is
        empty when the page could not be downloaded or parsed.
    """
    html = get_html(url)
    # get_html returns None on failure; there is nothing to parse then.
    if not html:
        return []

    tree = etree.HTML(html)
    # NOTE(review): some lxml versions appear to return None for blank
    # documents - guard so the xpath call below cannot fail on it.
    if tree is None:
        return []

    return tree.xpath(f'{xpath_path}/text()')


# Entry page for the example city (Hefei, Anhui) on the statistics site.
city_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/34/3401.html"

# One XPath per level of the hierarchy being walked.
county_xpath = '//tr[@class="countytr"]/td[2]/a'
town_xpath = '//tr[@class="towntr"]/td[2]/a'
village_xpath = '//tr[@class ="villagetr"]/td[3]'

result = []
# City page -> districts: name and link.
for county_name, county_url in parse_url(city_url, county_xpath):
    # District page -> towns/sub-districts: name and link.
    for town_name, town_url in parse_url(county_url, town_xpath):
        # Town page -> committees: name only, this is the last level.
        villages = parse_url2(town_url, village_xpath)
        result.extend([county_name, town_name, village] for village in villages)

# One row per committee, written out as a spreadsheet.
df = pd.DataFrame(result, columns=["区", "镇/街道", "居委会"])
df.to_excel("合肥市行政区域划分.xlsx", index=False)

从国家统计局抓取的地图省市区划代码和城乡划分代码(最新2020/06/03),共596071条数据。来源于国家统计局http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/。 数据结构: CREATE TABLE `area` ( `areaid` varchar(255) COLLATE utf8_unicode_ci NOT NULL, `area_name` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `fatherid` varchar(255) COLLATE utf8_unicode_ci DEFAULT NULL, `area_type` int(255) DEFAULT NULL COMMENT '区域代码:\r\n100 :城镇,110:城区,111 :主城区,112 :城乡结合区,120 :镇区,121 :镇中心区,122:镇乡结合区,123:特殊区域200 :乡村,210:乡中心区,220:村庄\r\n\r\n', `is_delete` int(255) DEFAULT '0', PRIMARY KEY (`areaid`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 部分数据: INSERT INTO `area` VALUES ('110000000000','北京市',NULL,NULL,0); INSERT INTO `area` VALUES ('110100000000','市辖区','110000000000',NULL,0); INSERT INTO `area` VALUES ('110101000000','东城区','110100000000',NULL,0); INSERT INTO `area` VALUES ('110101001000','东华门街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101001001','多福巷社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001002','银闸社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001005','东厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001006','智德社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001007','南池子社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001008','黄图岗社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001009','灯市口社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001010','正义路社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001011','甘雨社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001013','台基厂社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001014','韶九社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101001015','王府井社区居委会','110101001000',111,0); INSERT INTO `area` VALUES ('110101002000','景山街道','110101000000',NULL,0); INSERT INTO `area` VALUES ('110101002001','隆福寺社区居委会','110101002000',111,0); INSERT INTO `area` VALUES ('110101002002','吉祥社区居
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

诡途

你的鼓励是我 创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值