获取最新中国行政区划

废话少说,上代码:

import urlparse
from StringIO import StringIO
import datetime
import requests
import lxml
from lxml import etree

def get_latest_url(index_url):
    """Return the absolute URL of the first release linked from the index page.

    The index page is fetched and parsed as HTML.  The page must contain
    exactly one ``<ul class="center_list_contlist">``; the ``href`` of its
    first ``li/a`` link is resolved against ``index_url`` and returned.
    (The first link is presumably the newest release -- this relies on the
    site listing entries newest-first.)

    :param index_url: URL of the page that lists the published releases.
    :return: absolute URL of the first listed release, or ``None`` when the
        expected list is missing, ambiguous, or contains no links.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    """
    # requests never times out by default; without this a stalled server
    # would hang the script forever.
    response = requests.get(index_url, timeout=30)
    tree = etree.parse(StringIO(response.content), etree.HTMLParser())

    lists = tree.xpath('//ul[@class="center_list_contlist"]')
    if len(lists) != 1:
        return None

    hrefs = lists[0].xpath('li/a/@href')
    if not hrefs:
        # The list exists but is empty: report "not found" the same way as
        # a missing list instead of failing with IndexError.
        return None

    # Links on the page may be relative, so resolve against the index URL.
    return urlparse.urljoin(index_url, hrefs[0])

def get_xingzhengquhua_text(latest_url, referer=None):
    """Download a division-code page and return its contents as UTF-8 text.

    The page must contain exactly one ``<div class="xilan_con">``.  Every
    ``<p>`` below its first ``div/div`` child that has text in both its
    first and second ``<span>`` is treated as one record: the first span is
    the code, the second the name (ideographic spaces U+3000 are stripped
    from both ends of the name).  Paragraphs lacking either span text are
    skipped.

    Each record becomes one output line ``parent,code,name`` where the
    parent code is derived from the code itself:

    * code ends in ``0000`` -> parent is empty (top level);
    * code ends in ``00``   -> parent is the first two characters + ``0000``;
    * otherwise             -> parent is the first four characters + ``00``.

    The resulting text is also echoed to standard output.

    :param latest_url: URL of the page holding the code table.
    :param referer: optional value sent as the ``Referer`` request header;
        no such header is added when it is ``None`` or empty.
    :return: the newline-joined records encoded as UTF-8, or ``None`` when
        the page does not have the expected structure.
    :raises requests.exceptions.RequestException: on network failure or
        when the server does not answer within the timeout.
    """
    headers = {'Referer': referer} if referer else None
    # requests never times out by default; bound the wait explicitly.
    response = requests.get(latest_url, headers=headers, timeout=30)
    tree = etree.parse(StringIO(response.content), etree.HTMLParser())

    containers = tree.xpath('//div[@class="xilan_con"]')
    if len(containers) != 1:
        return None

    inner = containers[0].xpath('div/div')
    if not inner:
        # Layout changed: signal failure instead of raising IndexError.
        return None

    records = []
    for paragraph in inner[0].xpath('.//p'):
        try:
            code = paragraph.xpath('span[1]/text()')[0]
            name = paragraph.xpath('span[2]/text()')[0].strip(u'\u3000')
        except IndexError:
            # Not a data row (one of the two spans has no text).
            continue

        if code.endswith('0000'):
            parent = ''
        elif code.endswith('00'):
            parent = code[:2] + '0000'
        else:
            parent = code[:4] + '00'
        records.append((parent, code, name))

    text = '\n'.join(','.join(record) for record in records)
    text = text.encode('utf-8')
    print(text)
    return text

if __name__ == '__main__':
    # Script entry point: locate the latest release page, download the code
    # table and save it as 'latest-xingzhengquhua-<stamp>.txt' in the
    # current directory.
    index_url = 'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/'
    # Example of a release page URL:
    #   http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201401/t20140116_501070.html
    latest_url = get_latest_url(index_url)
    print(latest_url)
    if latest_url:
        text = get_xingzhengquhua_text(latest_url)
        filename = latest_url.strip().split('/')[-1]
        print(filename)

        # For a page name such as 't20140116_501070.html' the part before
        # the first '_' minus its leading letter is the stamp ('20140116').
        # Splitting and slicing never raise, so validate the result
        # explicitly and fall back to today's date for any other name.
        stamp = filename.split('_')[0][1:]
        if not stamp.isdigit():
            stamp = datetime.datetime.now().strftime('%Y-%m-%d')

        if text:
            # 'with' guarantees the file is closed even if write() fails.
            with open('latest-xingzhengquhua-%s.txt' % stamp, 'w') as ff:
                ff.write(text)
        else:
            print('Failed get xingzhengquehua data!')
    else:
        print('Failed get latest data url')
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值