python爬取近年来人口数量并预测趋势
需要的库
- re
- urllib.request
- numpy
- xlwt
- ssl
- cutecharts
整体逻辑
- 获取网页
- 筛取内容
- 预测
- 放进表格(xls)
- 生成折线统计图
导入库
import re #提供正则表达式支持
import urllib.request #爬虫必备库其中之一
import numpy as np #用于预测数据
import xlwt #表格写入支持
import ssl #需要就对了
from cutecharts.charts import Line #生成折线统计图
获取网站html
这里用到的网站是某人口统计网站
# Address of the population-statistics page that the article scrapes.
url = 'https://www.hongheiku.com/china/221.html'
# Open the page with urllib's default TLS settings; the surrounding article
# reports that this call fails with CERTIFICATE_VERIFY_FAILED in some setups.
response = urllib.request.urlopen(url)
# Read the full response body and decode the bytes as UTF-8 text.
html = response.read().decode('utf-8')
虽然逻辑上看起来没有问题,但是会报错
提取最根本的问题,就是最后一行那句“证书验证失败:证书链中的自签名证书”,原文是“urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1123)>”,但是这个问题只有在SublimeText里不会发生
其它编辑器就不行了
同样的环境(python3.8.5),但是在不同编辑器上运行效果不同,这个问题最好是解决一下,这里就需要用到ssl库了,更改后是这样的
# SECURITY NOTE: the two settings below switch off hostname checking and
# certificate verification, so the peer's identity is NOT authenticated and
# the downloaded page could be tampered with in transit. This is kept because
# the article relies on it to get past CERTIFICATE_VERIFY_FAILED; do not reuse
# this context for anything sensitive.
context = ssl.create_default_context()
# check_hostname must be disabled before verify_mode can be set to CERT_NONE.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

url = 'https://www.hongheiku.com/china/221.html'
# The with-statement closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url, context=context) as response:
    html = response.read().decode('utf-8')
看着一头雾水,其实就是让程序跳过证书验证,从而绕过了这个报错(注意:跳过验证会降低安全性,只适合这种抓取公开数据的练习场景)
提取网站数据
网站看起来花花绿绿的,但我们只需要这个表格,按下f12打开开发者模式,
并找到表格的位置
不难看出,表格是在一个table标签里,其中它的类(class)是table,还有一个宽度,
这里用到了正则表达式(Regular Expression),正则表达式虽然缺点不少,但它优点也不少😝
# Capture everything between <table class="table" width="N"> and </table>.
pattern = r'<table class=\"table\" width=\"\d+\">(.*?)</table>'
# DOTALL lets "." match newlines so a multi-line table is captured; the
# result is a list holding the inner markup of every matching table.
table_regex = re.compile(pattern, re.DOTALL)
html = table_regex.findall(html)
findall是re库里用于查找的函数,pattern的意思是:“将<table class="table" width=任意数字>所有内容</table>里的‘所有内容’提取出来”。其中,\d+是指一位及以上的数字,(.*?)是指(非贪婪地匹配的)任意内容。在双引号前写“\”是转义的写法(此处字符串用单引号包裹,不写也能匹配,但了解转义是非常有必要的),比如
在桌面的abcdefg文件夹里有一个名为new txt的文本文档,现在我想知道里面的内容,于是我输入了cat new txt.txt,可是并没有预期的效果
”没有找到new文件或路径“”没有找到txt.txt文件或路径“,这个时候就需要用到转义字符\了
不只是空格,一些有特殊含义的符号都可使用\来将其转换成普通文本,就连它自己也一样
现在要继续提取,直到获得我们想要的内容,也就是人口数据。回到刚刚的图片,可以发现数据都在td标签中,继续观察会发现里面都有一个width的定义,为了方便提取数字,可以先将它去掉,那么
# Matches every width="N" attribute so its digits are not later mistaken
# for table data.
pattern = r'width=\"\d+\"'
# At this point html is the list of table matches; fail with a clear message
# instead of a bare IndexError when the page contained no matching table.
if not html:
    raise ValueError('population table not found in the downloaded page')
# Keep only the first matched table, with the width attributes removed.
html = re.sub(pattern, '', html[0])
最后一步就是提取所有的数字,注意,所有的数字包括小数和负数,尽管人口负数有点不现实,但是出于严谨,还是要算上的
# Optionally signed decimals (tried first) or optionally signed integers.
pattern = r'[-+]?\d*\.\d+|[-+]?\d+'
# Collect every numeric token, in document order, as a list of strings.
html = [match.group() for match in re.finditer(pattern, html)]
现在所有数据已经提取出来了,简单做个分类
# The flat token list is dealt out round-robin into four series: offset k
# takes tokens k, k+4, k+8, ...  Each series is converted to float and then
# reversed (presumably the page lists the newest entry first - confirm).
population, ratio_male, ratio_female, population_urban = (
    [float(token) for token in html[offset::4]][::-1]
    for offset in range(4)
)
预测未来5年的人口
这一步我搞了半天才完成,因为用到了一个叫线性回归(Linear Regression)的东西,这个东西看着难,其实就是求自变量与因变量的关系,通过线性回归总结一个最优的等价关系式,比如一个简单的一元线性方程 $y = mx + b$
def linear_regression(x, y) -> np.ndarray:
    """Fit the line ``y = w[0] * x + w[1]`` by ordinary least squares.

    Args:
        x: 1-D array-like of sample positions (list, tuple or ndarray).
        y: 1-D array-like of observed values, same length as ``x``.

    Returns:
        A NumPy array ``[slope, intercept]``.

    Raises:
        numpy.linalg.LinAlgError: if the normal-equation matrix is singular,
            e.g. when all ``x`` values are equal.
    """
    # Accept plain sequences as well as arrays (the original required .shape).
    x = np.asarray(x)
    y = np.asarray(y)
    # Design matrix: a column of x values next to a column of ones, so the
    # second coefficient acts as the intercept.
    A = np.column_stack([x, np.ones(x.shape)])
    # Solve the normal equations (A^T A) w = A^T y for w.
    return np.linalg.solve(A.T @ A, A.T @ y)
def predict_future_population(w, n_years, population) -> np.ndarray:
    """Extrapolate a fitted line for the steps following the observed data.

    Args:
        w: Sequence ``[slope, intercept]``; only ``w[0]`` and ``w[1]`` are read.
        n_years: Number of future steps to predict.
        population: The observed series; only its length is used, as the
            index of the first predicted step.

    Returns:
        A NumPy array of ``n_years`` predicted values for the indices
        ``len(population)`` .. ``len(population) + n_years - 1``.
    """
    # Future x positions continue the 0-based index of the observed series.
    future_years = np.arange(n_years) + len(population)
    # Plain element-wise evaluation of slope * x + intercept.
    return future_years * w[0] + w[1]
这个程式码是在执行线性回归中的最小二乘法(Least Squares Method)来求解线性模型的参数:它在自变量旁边拼接一列全1,构成设计矩阵(Design Matrix),以此处理带有截距项(常数项)的线性回归问题,再通过正规方程解出斜率和截距。接下来就是使用这个程式码
# x axis for the fits: the 0-based position of each observation.
years = np.arange(len(population))

# One fit per series; w, e, r and t each hold that series' coefficients.
w, e, r, t = (
    linear_regression(years, series)
    for series in (population, ratio_male, ratio_female, population_urban)
)

# Forecast the next 5 steps of every series from its own coefficients.
(
    predicted_population,
    predicted_ratio_male,
    predicted_ratio_female,
    predicted_population_urban,
) = (
    predict_future_population(coefficients, 5, series)
    for coefficients, series in (
        (w, population),
        (e, ratio_male),
        (r, ratio_female),
        (t, population_urban),
    )
)
写入表格
# Build a one-sheet workbook: row 0 is the header, rows 1-4 hold one series
# each, with the observed values first and the predicted values after them.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('人口普查')

# Header row: sheet title in column 0, then one numbered column per
# observation followed by numbered columns marked as predictions.
worksheet.write(0, 0, label='人口普查')
for i, _ in enumerate(population):
    worksheet.write(0, i + 1, label=f'第{i+1}次')
for i, _ in enumerate(predicted_population):
    worksheet.write(0, i + len(population) + 1, label=f'第{i+len(population)+1}次(预测)')

# (row index, row title, observed values, predicted values)
sheet_rows = (
    (1, '人口', population, predicted_population),
    (2, '男性比例', ratio_male, predicted_ratio_male),
    (3, '女性比例', ratio_female, predicted_ratio_female),
    (4, '城镇人口', population_urban, predicted_population_urban),
)
for row, title, observed, predicted in sheet_rows:
    worksheet.write(row, 0, label=title)
    for column, value in enumerate(observed, start=1):
        worksheet.write(row, column, label=value)
    # Predictions continue in the columns right after the observed values.
    for column, value in enumerate(predicted, start=len(observed) + 1):
        worksheet.write(row, column, label=value)

workbook.save('rkpc.xls')
创建折线统计图
# Append each forecast to its observed series (in place) so that every line
# in the chart covers both the history and the prediction.
for observed, forecast in (
    (population, predicted_population),
    (ratio_male, predicted_ratio_male),
    (ratio_female, predicted_ratio_female),
    (population_urban, predicted_population_urban),
):
    observed.extend(forecast)

chart = Line("中国人口统计")
# x-axis labels: 1..N over the combined (observed + predicted) length.
labelxx = list(range(1, len(population) + 1))
chart.set_options(labels=labelxx, x_label="年份", y_label="人数", legend_pos="upRight")

# Rescale the population series to units of ten million so it fits on the
# same axis as the other three series.
population = [count / 10000000 for count in population]

for series_name, series in (
    ("人口(千万)", population),
    ("男性比例", ratio_male),
    ("女性比例", ratio_female),
    ("城镇人口", population_urban),
):
    chart.add_series(series_name, series)
chart.render()
源码
import re
import urllib.request
import numpy as np
import xlwt
import ssl
from cutecharts.charts import Line
def linear_regression(x, y) -> np.ndarray:
    """Fit the line ``y = w[0] * x + w[1]`` by ordinary least squares.

    Args:
        x: 1-D array-like of sample positions (list, tuple or ndarray).
        y: 1-D array-like of observed values, same length as ``x``.

    Returns:
        A NumPy array ``[slope, intercept]``.

    Raises:
        numpy.linalg.LinAlgError: if the normal-equation matrix is singular,
            e.g. when all ``x`` values are equal.
    """
    # Accept plain sequences as well as arrays (the original required .shape).
    x = np.asarray(x)
    y = np.asarray(y)
    # Design matrix: a column of x values next to a column of ones, so the
    # second coefficient acts as the intercept.
    A = np.column_stack([x, np.ones(x.shape)])
    # Solve the normal equations (A^T A) w = A^T y for w.
    return np.linalg.solve(A.T @ A, A.T @ y)
def predict_future_population(w, n_years, population) -> np.ndarray:
    """Extrapolate a fitted line for the steps following the observed data.

    Args:
        w: Sequence ``[slope, intercept]``; only ``w[0]`` and ``w[1]`` are read.
        n_years: Number of future steps to predict.
        population: The observed series; only its length is used, as the
            index of the first predicted step.

    Returns:
        A NumPy array of ``n_years`` predicted values for the indices
        ``len(population)`` .. ``len(population) + n_years - 1``.
    """
    # Future x positions continue the 0-based index of the observed series.
    future_years = np.arange(n_years) + len(population)
    # Plain element-wise evaluation of slope * x + intercept.
    return future_years * w[0] + w[1]
# SECURITY NOTE: the two settings below switch off hostname checking and
# certificate verification, so the peer's identity is NOT authenticated and
# the downloaded page could be tampered with in transit. Do not reuse this
# context for anything sensitive.
context = ssl.create_default_context()
# check_hostname must be disabled before verify_mode can be set to CERT_NONE.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE

url = 'https://www.hongheiku.com/china/221.html'
# The with-statement closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(url, context=context) as response:
    html = response.read().decode('utf-8')
# Step 1: capture the inner markup of <table class="table" width="N">.
# DOTALL lets "." match newlines so a multi-line table is captured.
pattern = r'<table class=\"table\" width=\"\d+\">(.*?)</table>'
html = re.findall(pattern, html, re.DOTALL)
# Fail with a clear message instead of a bare IndexError on html[0] when the
# page contained no matching table.
if not html:
    raise ValueError('population table not found in the downloaded page')

# Step 2: drop every width="N" attribute from the first matched table so its
# digits are not later mistaken for table data.
pattern = r'width=\"\d+\"'
html = re.sub(pattern, '', html[0])

# Step 3: collect every optionally signed decimal or integer as a string.
pattern = r'[-+]?\d*\.\d+|[-+]?\d+'
html = re.findall(pattern, html)

# Step 4: deal the flat token list out round-robin into four series
# (offset k takes tokens k, k+4, k+8, ...), convert to float and reverse
# (presumably the page lists the newest entry first - confirm).
population, ratio_male, ratio_female, population_urban = (
    [float(token) for token in html[offset::4]][::-1]
    for offset in range(4)
)
# x axis for the fits: the 0-based position of each observation.
years = np.arange(len(population))

# One fit per series; w, e, r and t each hold that series' coefficients.
w, e, r, t = (
    linear_regression(years, series)
    for series in (population, ratio_male, ratio_female, population_urban)
)

# Forecast the next 5 steps of every series from its own coefficients.
(
    predicted_population,
    predicted_ratio_male,
    predicted_ratio_female,
    predicted_population_urban,
) = (
    predict_future_population(coefficients, 5, series)
    for coefficients, series in (
        (w, population),
        (e, ratio_male),
        (r, ratio_female),
        (t, population_urban),
    )
)
# Build a one-sheet workbook: row 0 is the header, rows 1-4 hold one series
# each, with the observed values first and the predicted values after them.
workbook = xlwt.Workbook(encoding='utf-8')
worksheet = workbook.add_sheet('人口普查')

# Header row: sheet title in column 0, then one numbered column per
# observation followed by numbered columns marked as predictions.
worksheet.write(0, 0, label='人口普查')
for i, _ in enumerate(population):
    worksheet.write(0, i + 1, label=f'第{i+1}次')
for i, _ in enumerate(predicted_population):
    worksheet.write(0, i + len(population) + 1, label=f'第{i+len(population)+1}次(预测)')

# (row index, row title, observed values, predicted values)
sheet_rows = (
    (1, '人口', population, predicted_population),
    (2, '男性比例', ratio_male, predicted_ratio_male),
    (3, '女性比例', ratio_female, predicted_ratio_female),
    (4, '城镇人口', population_urban, predicted_population_urban),
)
for row, title, observed, predicted in sheet_rows:
    worksheet.write(row, 0, label=title)
    for column, value in enumerate(observed, start=1):
        worksheet.write(row, column, label=value)
    # Predictions continue in the columns right after the observed values.
    for column, value in enumerate(predicted, start=len(observed) + 1):
        worksheet.write(row, column, label=value)

workbook.save('rkpc.xls')
# Echo each observed series next to its forecast, one pair per line.
for observed, forecast in (
    (population, predicted_population),
    (ratio_male, predicted_ratio_male),
    (ratio_female, predicted_ratio_female),
    (population_urban, predicted_population_urban),
):
    print(observed, forecast)
# Append each forecast to its observed series (in place) so that every line
# in the chart covers both the history and the prediction.
for observed, forecast in (
    (population, predicted_population),
    (ratio_male, predicted_ratio_male),
    (ratio_female, predicted_ratio_female),
    (population_urban, predicted_population_urban),
):
    observed.extend(forecast)

chart = Line("中国人口统计")
# x-axis labels: 1..N over the combined (observed + predicted) length.
labelxx = list(range(1, len(population) + 1))
chart.set_options(labels=labelxx, x_label="年份", y_label="人数", legend_pos="upRight")

# Rescale the population series to units of ten million so it fits on the
# same axis as the other three series.
population = [count / 10000000 for count in population]

for series_name, series in (
    ("人口(千万)", population),
    ("男性比例", ratio_male),
    ("女性比例", ratio_female),
    ("城镇人口", population_urban),
):
    chart.add_series(series_name, series)
chart.render()