python爬取近年来人口数量并预测趋势_python人口预测-优快云博客

本文链接：https://blog.youkuaiyun.com/Stukdee/article/details/141751604

python爬取近年来人口数量并预测趋势

需要的库

re
urllib.request
numpy
xlwt
ssl
cutecharts

整体逻辑

获取网页
筛取内容
预测
放进表格(xls)
生成折线统计图

导入库

import re #提供正则表达式支持
import urllib.request #爬虫必备库其中之一
import numpy as np #用于预测数据
import xlwt #表格写入支持
import ssl #需要就对了
from cutecharts.charts import Line #生成折线统计图

获取网站html

这里用到的网站是某人口统计网站

url = 'https://www.hongheiku.com/china/221.html'
# Open the page inside a context manager so the HTTP response (and its
# underlying socket) is closed even if read() or decode() raises.
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')

虽然逻辑上看起来没有问题，但是会报错
在这里插入图片描述

提取最根本的问题，就是最后一行那句”证书验证失败：证书链中的自签名证书“，原文是”urllib.error.URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1123)>
“，但是这个问题只有在SublimeText里不会发生
在这里插入图片描述

其它编辑器就不行了
在这里插入图片描述

同样的环境(python3.8.5)，但是在不同编辑器上运行效果不同，这个问题最好是解决一下，这里就需要用到ssl库了，更改后是这样的

# NOTE(security): hostname checking and certificate verification are switched
# off on purpose to get past the CERTIFICATE_VERIFY_FAILED error. The server's
# identity is therefore NOT validated; only use this for public, non-sensitive pages.
context = ssl.create_default_context()
# check_hostname has to be disabled before verify_mode may be set to CERT_NONE.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
url = 'https://www.hongheiku.com/china/221.html'
# The context manager closes the HTTP response even if read() or decode() raises.
with urllib.request.urlopen(url, context=context) as response:
    html = response.read().decode('utf-8')

看着一头雾水，其实就是跳过证书验证，从而绕开了这个报错（注意：这样做会失去HTTPS对服务器身份的校验，只适合抓取公开数据的练习场景）
在这里插入图片描述

提取网站数据

网站看起来花花绿绿的，但我们只需要这个表格，按下f12打开开发者模式，
并找到表格的位置
在这里插入图片描述

不难看出，表格是在一个table标签里，其中它的类(class)是table，还有一个宽度，
这里用到了正则表达式(Regular Expression)，正则表达式虽然缺点不少，但它优点也不少😝

# Capture the inner HTML of every <table class="table" width="N"> element.
pattern = r'<table class=\"table\" width=\"\d+\">(.*?)</table>'
# DOTALL lets "." match newlines too, so a table spanning several lines is matched.
html = re.compile(pattern, re.DOTALL).findall(html)

在这里插入图片描述

findall是re库里用于查找的函数，pattern的意思是：“将<table class="table" width=任意数字>所有内容</table>里的‘所有内容’提取出来”。其中，\d+是指一位及以上的数字，(.*?)是指以非贪婪方式匹配任意内容（配合re.DOTALL才能跨行匹配）。在双引号前写“\”是为了明确表示它只是一个普通字符（在这个用单引号包裹的原始字符串里，不写也能匹配），转义字符的作用可以看下面这个例子，比如
在这里插入图片描述

在桌面的abcdefg文件夹里有一个名为new txt的文本文档，现在我想知道里面的内容，于是我输入了cat new txt.txt，可是并没有预期的效果
在这里插入图片描述

”没有找到new文件或路径“”没有找到txt.txt文件或路径“，这个时候就需要用到转义字符\了
在这里插入图片描述

不只是空格，一些有特殊含义的符号都可使用\来将其转换成文本，就连它自己也一样。
现在要继续提取直到获得我们想要的内容，也就是人口数据，回到刚刚的图片，可以发现数据都在td标签中，继续观察会发现里面都有一个width的定义，为了方便提取数字，可以先将它去掉，那么

# Match every width="N" attribute so its digits are not picked up as data later.
pattern = r'width=\"\d+\"'
# Only the first matched table is kept; the attributes are replaced with nothing.
html = re.compile(pattern).sub('', html[0])

在这里插入图片描述

最后一步就是提取所有的数字，注意，所有的数字包括小数和负数，尽管人口负数有点不现实，但是出于严谨，还是要算上的

# Signed decimals are tried first, then signed integers.
pattern = r'[-+]?\d*\.\d+|[-+]?\d+'
# Collect the text of every numeric match, in document order.
html = [match.group() for match in re.finditer(pattern, html)]

在这里插入图片描述

现在所有数据已经提取出来了，简单做个分类

# Deal the flat list of numbers into four series: every 4th value, starting at
# offsets 0..3, belongs to the same series.
population, ratio_male, ratio_female, population_urban = (html[offset::4] for offset in range(4))
# Convert each series to floats and flip its order
# (presumably the page lists the most recent entry first — confirm against the site).
population = [float(item) for item in population][::-1]
ratio_male = [float(item) for item in ratio_male][::-1]
ratio_female = [float(item) for item in ratio_female][::-1]
population_urban = [float(item) for item in population_urban][::-1]

预测未来5年的人口

这一步我搞了半天才完成，因为用到了一个叫线性回归（Linear Regression）的东西，这个东西看着难，其实就是求自变量与因变量的关系，通过线性回归总结一个最优的等价关系式，比如一个简单的一元线性方程 $y = m x + b$

def linear_regression(x, y) -> np.ndarray:
    """Fit the line ``y = m * x + b`` by ordinary least squares.

    Args:
        x: One-dimensional array-like of independent-variable values
            (a plain list works as well as a NumPy array).
        y: Array-like of dependent-variable values, same length as ``x``.

    Returns:
        NumPy array ``[m, b]``: the slope followed by the intercept.

    Raises:
        numpy.linalg.LinAlgError: If the normal equations are singular,
            for example when every ``x`` value is identical.
    """
    x = np.asarray(x)  # accept lists too; the original required an ndarray for .shape
    y = np.asarray(y)
    # Design matrix: one column holding x and one column of ones for the intercept.
    A = np.column_stack([x, np.ones(x.shape)])
    # Solve the normal equations (A^T A) w = A^T y for w = [m, b].
    AT_A = A.T @ A
    AT_y = A.T @ y
    return np.linalg.solve(AT_A, AT_y)
def predict_future_population(w, n_years, population) -> np.ndarray:
    """Extrapolate a fitted line past the end of an observed series.

    Args:
        w: Line coefficients ``[slope, intercept]``.
        n_years: Number of future points to predict.
        population: The observed series. Only its length is used: it is the
            index of the first predicted point.

    Returns:
        NumPy array with ``n_years`` predicted values, for the indices
        ``len(population)``, ``len(population) + 1``, ...
    """
    # Indices continue where the observed data stopped.
    future_years = np.arange(n_years) + len(population)
    # Evaluate slope * index + intercept element-wise.
    return future_years * w[0] + w[1]

这段代码用线性回归中的最小二乘法（Least Squares Method）来求解线性模型的参数：在自变量 $x$ 旁边拼上一列全 1，得到矩阵 $A$（文中称为增广矩阵），这样截距项（常数项）$b$ 就能和斜率 $m$ 一起求出，也就是求解正规方程 $A^{T}A\,w = A^{T}y$，其中 $w = (m, b)$。接下来就是使用这两个函数

# The x axis is simply the position of each observation: 0, 1, 2, ...
years = np.arange(len(population))
# Fit one line per series; w, e, r, t hold the coefficients for population,
# male ratio, female ratio and urban population respectively.
w, e, r, t = (
    linear_regression(years, series)
    for series in (population, ratio_male, ratio_female, population_urban)
)
# Extend every fitted line by five further points.
(
    predicted_population,
    predicted_ratio_male,
    predicted_ratio_female,
    predicted_population_urban,
) = (
    predict_future_population(coefficients, 5, series)
    for coefficients, series in zip(
        (w, e, r, t),
        (population, ratio_male, ratio_female, population_urban),
    )
)

写入表格

# One sheet: row 0 is the header, rows 1-4 each hold one series
# (observed values first, predicted values directly after them).
workbook = xlwt.Workbook(encoding = 'utf-8')
worksheet = workbook.add_sheet('人口普查')

# Header row: numbered observed columns, then numbered predicted columns.
worksheet.write(0, 0, label='人口普查')
observed_count = len(population)
for col in range(1, observed_count + 1):
    worksheet.write(0, col, label=f'第{col}次')
for col in range(observed_count + 1, observed_count + len(predicted_population) + 1):
    worksheet.write(0, col, label=f'第{col}次(预测)')

# (row title, observed series, predicted series) in sheet order.
sheet_rows = (
    ('人口', population, predicted_population),
    ('男性比例', ratio_male, predicted_ratio_male),
    ('女性比例', ratio_female, predicted_ratio_female),
    ('城镇人口', population_urban, predicted_population_urban),
)
for row_index, (title, observed, predicted) in enumerate(sheet_rows, start=1):
    worksheet.write(row_index, 0, label=title)
    for col, value in enumerate(observed, start=1):
        worksheet.write(row_index, col, label=value)
    # Predictions start in the column right after the last observed value.
    for col, value in enumerate(predicted, start=len(observed) + 1):
        worksheet.write(row_index, col, label=value)
workbook.save('rkpc.xls')

创建折线统计图

# Append each forecast to its observed series (in place) so that every line
# on the chart covers observed and predicted points.
for series, forecast in (
    (population, predicted_population),
    (ratio_male, predicted_ratio_male),
    (ratio_female, predicted_ratio_female),
    (population_urban, predicted_population_urban),
):
    series.extend(forecast)

chart = Line("中国人口统计")
# X-axis labels are 1..N, one per data point.
labelxx = list(range(1, len(population) + 1))
chart_options = {
    "labels": labelxx,
    "x_label": "年份",
    "y_label": "人数",
    "legend_pos": "upRight",
}
chart.set_options(**chart_options)

# Scale the population down so it can share an axis with the other series.
population = [count / 10000000 for count in population]
for series_name, values in (
    ("人口(千万)", population),
    ("男性比例", ratio_male),
    ("女性比例", ratio_female),
    ("城镇人口", population_urban),
):
    chart.add_series(series_name, values)
chart.render()

源码

import re
import urllib.request
import numpy as np
import xlwt
import ssl
from cutecharts.charts import Line
def linear_regression(x, y) -> np.ndarray:
    """Fit the line ``y = m * x + b`` by ordinary least squares.

    Args:
        x: One-dimensional array-like of independent-variable values
            (a plain list works as well as a NumPy array).
        y: Array-like of dependent-variable values, same length as ``x``.

    Returns:
        NumPy array ``[m, b]``: the slope followed by the intercept.

    Raises:
        numpy.linalg.LinAlgError: If the normal equations are singular,
            for example when every ``x`` value is identical.
    """
    x = np.asarray(x)  # accept lists too; the original required an ndarray for .shape
    y = np.asarray(y)
    # Design matrix: one column holding x and one column of ones for the intercept.
    A = np.column_stack([x, np.ones(x.shape)])
    # Solve the normal equations (A^T A) w = A^T y for w = [m, b].
    AT_A = A.T @ A
    AT_y = A.T @ y
    return np.linalg.solve(AT_A, AT_y)
def predict_future_population(w, n_years, population) -> np.ndarray:
    """Extrapolate a fitted line past the end of an observed series.

    Args:
        w: Line coefficients ``[slope, intercept]``.
        n_years: Number of future points to predict.
        population: The observed series. Only its length is used: it is the
            index of the first predicted point.

    Returns:
        NumPy array with ``n_years`` predicted values, for the indices
        ``len(population)``, ``len(population) + 1``, ...
    """
    # Indices continue where the observed data stopped.
    future_years = np.arange(n_years) + len(population)
    # Evaluate slope * index + intercept element-wise.
    return future_years * w[0] + w[1]
# NOTE(security): hostname checking and certificate verification are switched
# off on purpose to get past a CERTIFICATE_VERIFY_FAILED error. The server's
# identity is therefore NOT validated; only use this for public, non-sensitive pages.
context = ssl.create_default_context()
# check_hostname has to be disabled before verify_mode may be set to CERT_NONE.
context.check_hostname = False
context.verify_mode = ssl.CERT_NONE
url = 'https://www.hongheiku.com/china/221.html'
# The context manager closes the HTTP response even if read() or decode() raises.
with urllib.request.urlopen(url, context=context) as response:
    html = response.read().decode('utf-8')
# Capture the inner HTML of every <table class="table" width="N"> element;
# DOTALL lets "." match newlines so multi-line tables are matched.
pattern = r'<table class=\"table\" width=\"\d+\">(.*?)</table>'
html = re.compile(pattern, re.DOTALL).findall(html)
# Remove every width="N" attribute from the first table so its digits are not
# collected as data below.
pattern = r'width=\"\d+\"'
html = re.compile(pattern).sub('', html[0])
# Collect every signed decimal or signed integer left in the markup, in order.
pattern = r'[-+]?\d*\.\d+|[-+]?\d+'
html = [match.group() for match in re.finditer(pattern, html)]
# Deal the flat list into four series: every 4th value, starting at offsets 0..3.
population, ratio_male, ratio_female, population_urban = (html[offset::4] for offset in range(4))
# Convert each series to floats and flip its order
# (presumably the page lists the most recent entry first — confirm against the site).
population = [float(item) for item in population][::-1]
ratio_male = [float(item) for item in ratio_male][::-1]
ratio_female = [float(item) for item in ratio_female][::-1]
population_urban = [float(item) for item in population_urban][::-1]
# The x axis is simply the position of each observation: 0, 1, 2, ...
years = np.arange(len(population))
# Fit one line per series; w, e, r, t hold the coefficients for population,
# male ratio, female ratio and urban population respectively.
w, e, r, t = (
    linear_regression(years, series)
    for series in (population, ratio_male, ratio_female, population_urban)
)
# Extend every fitted line by five further points.
(
    predicted_population,
    predicted_ratio_male,
    predicted_ratio_female,
    predicted_population_urban,
) = (
    predict_future_population(coefficients, 5, series)
    for coefficients, series in zip(
        (w, e, r, t),
        (population, ratio_male, ratio_female, population_urban),
    )
)
# One sheet: row 0 is the header, rows 1-4 each hold one series
# (observed values first, predicted values directly after them).
workbook = xlwt.Workbook(encoding = 'utf-8')
worksheet = workbook.add_sheet('人口普查')

# Header row: numbered observed columns, then numbered predicted columns.
worksheet.write(0, 0, label='人口普查')
observed_count = len(population)
for col in range(1, observed_count + 1):
    worksheet.write(0, col, label=f'第{col}次')
for col in range(observed_count + 1, observed_count + len(predicted_population) + 1):
    worksheet.write(0, col, label=f'第{col}次(预测)')

# (row title, observed series, predicted series) in sheet order.
sheet_rows = (
    ('人口', population, predicted_population),
    ('男性比例', ratio_male, predicted_ratio_male),
    ('女性比例', ratio_female, predicted_ratio_female),
    ('城镇人口', population_urban, predicted_population_urban),
)
for row_index, (title, observed, predicted) in enumerate(sheet_rows, start=1):
    worksheet.write(row_index, 0, label=title)
    for col, value in enumerate(observed, start=1):
        worksheet.write(row_index, col, label=value)
    # Predictions start in the column right after the last observed value.
    for col, value in enumerate(predicted, start=len(observed) + 1):
        worksheet.write(row_index, col, label=value)
workbook.save('rkpc.xls')
# (observed series, predicted series) pairs, in the order they are reported.
series_pairs = (
    (population, predicted_population),
    (ratio_male, predicted_ratio_male),
    (ratio_female, predicted_ratio_female),
    (population_urban, predicted_population_urban),
)
# Print observed and predicted values side by side before they are merged.
for series, forecast in series_pairs:
    print(series, forecast)
# Append each forecast to its observed series (in place) so that every line
# on the chart covers observed and predicted points.
for series, forecast in series_pairs:
    series.extend(forecast)

chart = Line("中国人口统计")
# X-axis labels are 1..N, one per data point.
labelxx = list(range(1, len(population) + 1))
chart_options = {
    "labels": labelxx,
    "x_label": "年份",
    "y_label": "人数",
    "legend_pos": "upRight",
}
chart.set_options(**chart_options)

# Scale the population down so it can share an axis with the other series.
population = [count / 10000000 for count in population]
for series_name, values in (
    ("人口(千万)", population),
    ("男性比例", ratio_male),
    ("女性比例", ratio_female),
    ("城镇人口", population_urban),
):
    chart.add_series(series_name, values)
chart.render()