Fetching All of China's Provinces, Cities, Counties, Towns and Villages with Python

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# author=He

"""
Build the full list of China's administrative divisions
(province / city / county / town / village) from the
National Bureau of Statistics website.
"""
import sys
import os
import re
from urllib import request
from bs4 import BeautifulSoup
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/'
header = {
    'Cookie': 'AD_RS_COOKIE=20080917',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36'}


class GetHttp:
    def __init__(self, url, headers=None, charset='utf8'):
        if headers is None:
            headers = {}
        self._response = ''
        try:
            print(url)
            self._response = request.urlopen(request.Request(url=url, headers=headers))
        except Exception as e:
            print(e)
        self._c = charset

    @property
    def text(self):
        try:
            return self._response.read().decode(self._c)
        except Exception as e:
            print(e)
            return ''
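
# Usage sketch: GetHttp(url, header, 'gbk').text fetches one page and decodes it
# as GBK, returning '' if either the request or the decoding fails.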


def provincetr(u, he, lists):
    # Fetch all provinces and municipalities (top level)
    t = GetHttp(u, he, 'gbk').text
    if t:
        soup = BeautifulSoup(t, 'html.parser')
        for i in soup.find_all(attrs={'class': 'provincetr'}):
            for a in i.find_all('a'):
                id = re.sub(r"\D", "", a.get('href'))
                lists[id] = {'id': id, 'name': a.text, 'pid': '0', 'pid1': '0', 'pid2': '0', 'pid3': '0', 'pid4': '0',
                             'code': id}
                # time.sleep(1 / 10)
    return lists


def citytr(u, he, lists):
    # Fetch the prefecture-level cities under each province
    l = lists.copy()
    for i in l:
        t = GetHttp(u+i+'.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'citytr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:4] not in lists.keys():
                lists[id[0:4]] = {'id': id[0:4], 'name': str(v.find_all('td')[1].text),
                                  'pid': '0', 'pid1': i, 'pid2': '0', 'pid3': '0', 'pid4': '0', 'code': id}
    return lists


def countytr(u, he, lists):
    # Fetch the counties/districts under each city
    l = lists.copy()
    for i in l:
        if len(i) != 4:
            # only city-level codes (4 digits) have a county page
            continue
        t = GetHttp(u+i[0:2]+'/'+i+'.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'countytr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:6] not in lists.keys():
                lists[id[0:6]] = {'id': id[0:6], 'name': str(v.find_all('td')[1].text),
                                  'pid': '0', 'pid1': l[i]['pid1'], 'pid2': i, 'pid3': '0', 'pid4': '0', 'code': id}
    return lists


def towntr(u, he, lists):
    # Fetch the towns/townships under each county
    l = lists.copy()
    for i in l:
        if len(i) != 6:
            # only county-level codes (6 digits) have a town page
            continue
        t = GetHttp(u+i[0:2]+'/'+i[2:4]+'/'+i+'.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'towntr'}):
            id = str(v.find_all('td')[0].text)
            if id[0:9] not in lists.keys():
                lists[id[0:9]] = {'id': id[0:9], 'name': str(v.find_all('td')[1].text), 'pid': '0',
                                  'pid1': l[i]['pid1'], 'pid2': l[i]['pid2'], 'pid3': i, 'pid4': '0', 'code': id}
    return lists


def villagetr(u, he, lists):
    # Fetch the villages/communities under each town
    l = lists.copy()
    for i in l:
        if len(i) != 9:
            # only town-level codes (9 digits) have a village page
            continue
        t = GetHttp(u+i[0:2]+'/'+i[2:4]+'/'+i[4:6]+'/'+i+'.html', he, 'gbk').text
        if not t:
            continue
        soup = BeautifulSoup(t, 'html.parser')
        for v in soup.find_all(attrs={'class': 'villagetr'}):
            # village rows have three columns: code, urban/rural classification code, name
            id = str(v.find_all('td')[0].text)
            if id[0:12] not in lists.keys():
                lists[id[0:12]] = {'id': id[0:12], 'name': str(v.find_all('td')[2].text), 'pid': '0',
                                   'pid1': l[i]['pid1'], 'pid2': l[i]['pid2'], 'pid3': l[i]['pid3'], 'pid4': i,
                                   'code': id}
    return lists


p = provincetr(u=url, he=header, lists={})
print('provinces done')
c = citytr(u=url, he=header, lists=p)
print('cities done')
o = countytr(u=url, he=header, lists=c)
print('counties done')
t = towntr(u=url, he=header, lists=o)
print('towns done')
v = villagetr(u=url, he=header, lists=t)
print('villages done')
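
The script above only builds the result in memory. As a minimal sketch (not part of the original post), the snippet below can be appended to the end of the script to dump the collected dict to a JSON file once the crawl finishes; it assumes the final dict is still bound to v, and the file name regions_2016.json is an arbitrary choice. Since the region codes have a fixed length per level (2, 4, 6, 9 and 12 digits), the code length can also be used to split the flat dict back into levels.

# Persisting the result -- a sketch, assuming `v` holds the final {code: record} dict
import json
from collections import Counter

with open('regions_2016.json', 'w', encoding='utf8') as f:
    json.dump(v, f, ensure_ascii=False, indent=2)

# records per level, grouped by code length (2=province, 4=city, 6=county, 9=town, 12=village)
print(Counter(len(k) for k in v))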