# Needed province/city/district data for work, so I put this scraper together.
import os
import random
import time
from urllib.parse import urljoin

import requests
import xlsxwriter
from bs4 import BeautifulSoup
# Entry page of the 2018 edition; demo() is first called on this URL.
url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html"
# Example of a deeper page, handy when debugging a single level:
# http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/12/1201.html

# Flag read by demo(): 1 until demo() has processed its first page, 0 afterwards.
count = 1
# Link texts that are not all digits (the names); written to column B.
fin_result = []
# Link texts that are all digits (the codes); written to column A.
result = []
# Header row of the exported sheet.
tag_name = ['统计用区划代码', '名称']
# Links collected by demo() while `count` is still 1.
citytrHref = []
# Links collected by demo() when it is called with ok == 'isok'.
countytrHref = []
def demo(url, test, ok):
    """Fetch one listing page and append its link texts to the module lists.

    Every element whose ``class`` attribute equals *test* is treated as a
    table row; each ``<a>`` found inside the row's ``<td>`` cells is
    processed as follows:

    * While the module-level ``count`` flag is still 1 (first page), the
      link text is stored in ``fin_result`` with the placeholder code
      ``'000000000000'`` in ``result``, and the link target is queued in
      ``citytrHref``.
    * Afterwards, all-digit link texts go to ``result`` and every other
      link text goes to ``fin_result``.

    Args:
        url: Address of the page to download; the body is decoded as GBK.
        test: Value of the ``class`` attribute that marks the rows to scan
            (the callers pass ``'provincetr'``, ``'citytr'``, ``'countytr'``).
        ok: When equal to ``'isok'``, the link target of every matched row
            that contains a link is queued in ``countytrHref``.

    Side effects:
        Mutates ``result``, ``fin_result``, ``citytrHref`` and
        ``countytrHref``, prints each link text, sleeps a random 0-8 s once
        per page, and resets the global ``count`` to 0.
    """
    global count
    response = requests.get(url)
    soup = BeautifulSoup(response.content.decode('gbk'), 'lxml')

    # Snapshot the flag: it only changes at the very end of this function.
    is_first_page = count == 1

    # `attrs` must be a dict; a set literal here would also match elements
    # whose class is literally "class".
    for row in soup.find_all(attrs={'class': test}):
        # NOTE(review): rows whose cells contain no <a> contribute nothing
        # to the output lists -- confirm that this is acceptable.
        row_href = None
        for cell in row.find_all('td'):
            for link in cell.find_all('a'):
                name = link.text
                # Resolve relative to the fetched page so nested pages yield
                # correct URLs as well.
                row_href = urljoin(url, link['href'])
                if is_first_page:
                    result.append('000000000000')
                    fin_result.append(name)
                    citytrHref.append(row_href)
                elif name.isdigit():
                    result.append(name)
                else:
                    fin_result.append(name)
                print(name)
        # Queue the row's target once, and only if the row had a link.
        if ok == 'isok' and row_href is not None:
            countytrHref.append(row_href)

    # Politeness delay between consecutive page requests.
    time.sleep(random.random() * 8)
    count = 0
def save_excel(fin_result, result, tag_name, file_name,
               out_dir=r'C:\Users\m1769\Desktop'):
    """Store the scraped data in an Excel workbook.

    The sheet layout is: *tag_name* across row 1 starting at A1, *result*
    down column A starting at A2, and *fin_result* down column B starting
    at B2. The columns are written regardless of how many rows there are.

    Args:
        fin_result: Values for column B.
        result: Values for column A.
        tag_name: Header cells for row 1.
        file_name: Base name of the workbook, without extension.
        out_dir: Directory that receives the file; defaults to the
            original author's desktop folder.
    """
    # xlsxwriter produces the XLSX format, so name the file accordingly;
    # an ".xls" suffix makes Excel warn about a format mismatch.
    path = os.path.join(out_dir, '%s.xlsx' % file_name)
    # The context manager closes (and thereby saves) the workbook even if
    # one of the writes raises.
    with xlsxwriter.Workbook(path) as book:
        sheet = book.add_worksheet()
        sheet.write_row('A1', tag_name)
        sheet.write_column('A2', result)
        sheet.write_column('B2', fin_result)
if __name__ == '__main__':
    # First page: demo() queues the links it finds in citytrHref.
    demo(url, 'provincetr', '')

    # Second level: 'isok' makes demo() queue each row's link in countytrHref.
    for city_page in citytrHref:
        demo(city_page, 'citytr', 'isok')

    # Third level: only codes and names are collected, nothing is queued.
    for county_page in countytrHref:
        demo(county_page, 'countytr', '')

    save_excel(fin_result, result, tag_name, '2018城市信息')
    print('导出完成')