抓取国家统计局全国城乡街道信息,并保存到 MongoDB

本文介绍了一种使用Python爬虫技术抓取中国各省份、城市、区县、街道及社区详细信息的方法,通过解析国家统计局网站数据,利用requests、BeautifulSoup、pymongo等库实现了数据的获取与存储。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import requests,re,time,pymongo
from bs4 import BeautifulSoup as bs

# Crawl the National Bureau of Statistics division-code pages level by level
# (province -> city -> county -> town -> village) and store one MongoDB
# document per village-level entry.

# Root directory of the 2017 code tables and the index page that lists provinces.
base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}

# Seconds to wait for the server before giving up on a single request, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 30


def child_url(page_url, href):
    """Resolve a relative *href* against the directory of *page_url*.

    The previous version rebuilt every URL from ``base_url`` by hand and
    hard-coded the city directory as ``"01/"`` when building village-page
    URLs, which is only right for the first city of a province. Joining each
    href to the page it was found on works at every level.

    Assumes hrefs are plain relative paths without ``..`` segments, which
    matches how the province, city and county levels were already joined.
    """
    return page_url.rsplit("/", 1)[0] + "/" + href


def fetch_soup(page_url):
    """Download *page_url* and return it parsed with the lxml parser."""
    rsp = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # The pages are served in a GBK-family encoding; decode explicitly so the
    # Chinese text is readable.
    return bs(rsp.content.decode("gbk"), "lxml")


def named_links(soup, row_class, page_url):
    """Return ``(name, absolute_url)`` pairs for the rows of class *row_class*.

    Each row is expected to carry two anchors (code, then name) pointing at
    the same child page, so every second anchor starting at index 1 is taken.
    """
    anchors = soup.select("body ." + row_class + " td a")[1::2]
    return [
        (a.get_text(), child_url(page_url, a["href"]))
        for a in anchors
        if a.get("href")
    ]


def iter_records():
    """Yield one dict per village-level entry found by walking all levels."""
    index_soup = fetch_soup(url)
    # Province rows hold a single anchor per province, so no slicing is needed.
    provinces = [
        (a.get_text(), child_url(url, a["href"]))
        for a in index_soup.select("body .provincetr a")
        if a.get("href")
    ]

    for shenfen, shenfen_page in provinces:
        time.sleep(1)  # be polite to the server
        shenfen_soup = fetch_soup(shenfen_page)

        for chengshi, chengshi_page in named_links(shenfen_soup, "citytr", shenfen_page):
            time.sleep(1)
            chengshi_soup = fetch_soup(chengshi_page)

            for chengqu, chengqu_page in named_links(chengshi_soup, "countytr", chengshi_page):
                time.sleep(1)
                chengqu_soup = fetch_soup(chengqu_page)

                for xiangcun, xiangcun_page in named_links(chengqu_soup, "towntr", chengqu_page):
                    time.sleep(2)
                    xiangcun_soup = fetch_soup(xiangcun_page)

                    # Village rows have three cells; the third one is the name.
                    cells = xiangcun_soup.select("body .villagetr td")[2::3]
                    for cell in cells:
                        yield {
                            "省": shenfen,
                            "市": chengshi,
                            "区": chengqu,
                            "乡": xiangcun,
                            "街道": cell.get_text(),
                        }


def main():
    """Run the crawl, store every record in MongoDB and print the elapsed time."""
    start_time = time.time()
    stored = 0

    client = pymongo.MongoClient(host="localhost", port=27017)
    try:
        collection = client.chengxiang_daima["chengxiang_daima"]
        for info in iter_records():
            print(info)
            try:
                # insert_one replaces Collection.insert, which was removed in
                # PyMongo 4.
                collection.insert_one(info)
            except pymongo.errors.PyMongoError as exc:
                # Report the failed record and keep crawling; only database
                # errors are caught so Ctrl-C and real bugs still surface.
                print(info)
                print("报错")
                print(repr(exc))
                continue
            stored += 1
            print(f"已存入{stored}条信息")
    finally:
        client.close()

    running_time = time.time() - start_time
    print(running_time)


if __name__ == "__main__":
    main()

转载于:https://www.cnblogs.com/cwkcwk/p/9916940.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值