import re
import time
from urllib.parse import urljoin

import pymongo
import requests
from bs4 import BeautifulSoup as bs
# Crawler for the 2017 urban/rural division code tables on stats.gov.cn.
#
# The site is walked as a tree of static pages:
#   index -> province -> city -> county -> town -> village table
# and one MongoDB document is stored per village-table row.
#
# Links are resolved against the page they were found on. The hand-built
# prefixes this script used previously followed that rule for the first
# levels but hard-coded the city directory ("01/") for the town pages, which
# is only right for the first city of a province.

base_url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/"
url = base_url + "index.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
# Seconds to wait for the server before giving up on a single page.
REQUEST_TIMEOUT = 30


def child_url(parent_url, href):
    """Return the absolute URL of *href* as linked from the page at *parent_url*.

    Args:
        parent_url: Absolute URL of the page that contains the link.
        href: The link's ``href`` value (normally relative, e.g. ``"01/130102.html"``).

    Returns:
        The absolute URL the link points to.
    """
    return urljoin(parent_url, href)


def fetch_soup(page_url):
    """Download *page_url* and parse it with BeautifulSoup.

    Args:
        page_url: Absolute URL of the page to download.

    Returns:
        The parsed document, or ``None`` when the page could not be downloaded
        or decoded. The failure is printed so a long crawl keeps going instead
        of dying on one bad page.
    """
    try:
        response = requests.get(url=page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # The pages are served as GBK; decoding explicitly keeps the Chinese text intact.
        html = response.content.decode("gbk")
    except (requests.RequestException, UnicodeDecodeError) as exc:
        print("报错", page_url, exc)
        return None
    return bs(html, "lxml")


def list_children(page_url, selector, names_only=True):
    """List the (name, absolute URL) pairs linked from one listing page.

    Args:
        page_url: Absolute URL of the listing page.
        selector: CSS selector matching the anchors of the listing rows.
        names_only: When true, keep every second anchor only. Below the
            province level each row is expected to carry two links (the code,
            then the name); the name link is the one wanted.

    Returns:
        A list of ``(name, url)`` tuples; empty when the page is unavailable
        or has no matching rows.
    """
    soup = fetch_soup(page_url)
    if soup is None:
        return []
    anchors = soup.select(selector)
    if names_only:
        anchors = anchors[1::2]
    # Name and link are read from the same tag so they can never get misaligned.
    return [
        (anchor.get_text(), child_url(page_url, anchor["href"]))
        for anchor in anchors
        if anchor.get("href")
    ]


def list_villages(page_url):
    """Return the village names listed on one town page.

    Args:
        page_url: Absolute URL of the town page.

    Returns:
        A list of names; empty when the page is unavailable or has no rows.
    """
    soup = fetch_soup(page_url)
    if soup is None:
        return []
    # Rows are expected to hold three cells; the third one is the name.
    cells = soup.select("body .villagetr td")[2::3]
    return [cell.get_text() for cell in cells]


def iter_records():
    """Walk the whole tree depth-first and yield one document per village.

    The pauses before each page request (1 s for province, city and county
    pages, 2 s for town pages) throttle the crawl.

    Yields:
        Dicts with the keys "省", "市", "区", "乡" and "街道".
    """
    for province, province_url in list_children(url, "body .provincetr a", names_only=False):
        time.sleep(1)
        for city, city_url in list_children(province_url, "body .citytr td a"):
            time.sleep(1)
            for county, county_url in list_children(city_url, "body .countytr td a"):
                time.sleep(1)
                for town, town_url in list_children(county_url, "body .towntr td a"):
                    time.sleep(2)
                    for village in list_villages(town_url):
                        yield {"省": province, "市": city, "区": county, "乡": town, "街道": village}


def main():
    """Crawl the site, store every record in MongoDB and print the elapsed time."""
    num = 0  # number of documents stored so far
    str_time = time.time()
    with pymongo.MongoClient(host='localhost', port=27017) as client:
        collection = client.chengxiang_daima["chengxiang_daima"]
        for info in iter_records():
            print(info)
            try:
                # insert_one replaces Collection.insert, which PyMongo 4 removed.
                result = collection.insert_one(info)
            except pymongo.errors.PyMongoError:
                # Report the failed record and carry on with the next one.
                print(info)
                print("报错")
                continue
            if result.inserted_id is not None:
                num = num + 1
                print("已存入" + str(num) + "条信息")
    end_time = time.time()
    running_time = end_time - str_time
    print(running_time)


if __name__ == "__main__":
    main()
# Reposted from: https://www.cnblogs.com/cwkcwk/p/9916940.html