This is mostly the final project for one of my courses from the first half of 2020, when classes were held online because of the pandemic. The assignment followed a paper that analyzed SARS [1] and applied some of its methods to COVID-19 data, mainly by computing Moran's I. My own research has nothing to do with this area and I don't really know the precise meaning of these indicators; I only computed them and added a brief interpretation. Below I simply paste the code and the references.
First, the references. I don't know the finer details of Moran's I either, so I won't go into them; these are the blog posts I read while trying to understand it.
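For reference, the global Moran's I that the code below computes is usually written as

$$
I = \frac{n}{S_0}\,\frac{\sum_i \sum_j w_{ij}\,(x_i-\bar{x})(x_j-\bar{x})}{\sum_i (x_i-\bar{x})^2},
\qquad S_0 = \sum_i \sum_j w_{ij},
$$

where $x_i$ is the value observed in region $i$ and $w_{ij}$ is the spatial weight between regions $i$ and $j$. With a row-standardized weight matrix this is the slope of a through-origin regression of the spatial lag $Wz$ on the standardized values $z$, which is how the Moran module at the end of this post computes it.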
Below is the main data-processing and analysis workflow. The data files themselves are not included; just follow the process.
#%%
import pandas as pd
#%%
# Build the cumulative confirmed-case table
globalData = pd.read_csv('time_series_covid19_confirmed_global.csv')
# Read the global cumulative time series
chinaData = globalData[globalData['Country/Region'] == 'China']
# Select the 33 provinces/regions listed under China
taiwanData = globalData[globalData['Country/Region'] == 'Taiwan*'].copy()
# Select the Taiwan row as well (.copy() avoids SettingWithCopyWarning
# when the two columns below are overwritten)
taiwanData['Country/Region'] = 'China'
taiwanData['Province/State'] = 'Taiwan'
chinaData = pd.concat([chinaData, taiwanData])
# Merge into a single China data frame (34 rows)
chinaData = chinaData.reset_index(drop=True)
# Re-number the index of the merged data frame
chinaData.to_csv('ChinaData.csv', index=False)
#%%
import matplotlib.pyplot as plt
# Line chart of cumulative confirmed cases
chinaData = pd.read_csv('D:/WHO/ChinaData.csv', low_memory=False)
datelist = list(chinaData.columns[4:119])
# Extract the data from 1/22/20 to 5/15/20
for i in range(0, 34):
    x = datelist
    y = list(chinaData.loc[i, datelist])
    plt.plot(x, y)
plt.axvspan('1/22/20', '3/8/20', facecolor='#2E8B57', alpha=0.1)
plt.axvspan('3/8/20', '5/15/20', facecolor='#FFFF00', alpha=0.1)
plt.axvline(x='3/8/20')
plt.xticks(x, x, rotation=90)
plt.tick_params(labelsize=10)
plt.legend(range(0, 34))
plt.show()
# Draw the line chart
#%%
# Daily new cases: today's cumulative count minus yesterday's
increaseData = chinaData[chinaData.columns[0:4]]
datelist = list(chinaData.columns[4:119])
# The column names are the dates; keep them in a list
# Each column minus the previous one gives that day's new cases
for i in range(1, 115):
    increasenew = chinaData[datelist[i]] - chinaData[datelist[i-1]]
    increasenew = pd.DataFrame(increasenew, columns=[datelist[i]])
    increaseData = pd.concat([increaseData, increasenew], axis=1)
increaseData.to_csv('increaseData.csv', index=False)
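A quick consistency check on the differencing, run right after the cell above (a sketch, assuming the two csv files written so far): the count on the first day plus all the daily increases should rebuild the cumulative count on 5/15/20.
# Sketch: first-day count + all daily increases should equal the 5/15/20 total
cum = pd.read_csv('ChinaData.csv', low_memory=False)
inc = pd.read_csv('increaseData.csv', low_memory=False)
rebuilt = cum['1/22/20'] + inc[inc.columns[4:118]].sum(axis=1)
print((rebuilt == cum['5/15/20']).all())   # expected: True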
#%%
# Line chart of daily new cases
increaseData = pd.read_csv('increaseData.csv', low_memory=False)
datelist = list(increaseData.columns[4:118])
# Extract the daily-increase columns
for i in range(0, 34):
    x = datelist
    y = list(increaseData.loc[i, datelist])
    plt.plot(x, y)
plt.axvspan('1/23/20', '3/8/20', facecolor='#2E8B57', alpha=0.1)
plt.axvspan('3/8/20', '5/15/20', facecolor='#FFFF00', alpha=0.1)
plt.axvline(x='3/8/20')
plt.xticks(x, x, rotation=90)
plt.tick_params(labelsize=10)
plt.legend(range(0, 34))
plt.show()
#%%
# Aggregate confirmed cases by stage
chinaData = pd.read_csv('ChinaData.csv', low_memory=False)
statData = chinaData[chinaData.columns[0:4]].copy()
# .copy() so that adding the stage columns below does not trigger SettingWithCopyWarning
statData['stage1'] = chinaData['3/8/20']
statData['stage2'] = chinaData['5/15/20'] - chinaData['3/8/20']
statData['total'] = chinaData['5/15/20']
#%%
# Get the death counts from the daily report
globalData = pd.read_csv('05-15-2020.csv')
chinaData = globalData[globalData['Country_Region'] == 'China']
taiwanData = globalData[globalData['Country_Region'] == 'Taiwan*'].copy()
taiwanData['Country_Region'] = 'China'
taiwanData['Province_State'] = 'Taiwan'
chinaData = pd.concat([chinaData, taiwanData])
chinaData = chinaData.reset_index(drop=True)
statData['death'] = chinaData['Deaths']
statData.to_csv('statData.csv', index=False)
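The line statData['death'] = chinaData['Deaths'] matches rows purely by position, so it relies on the daily report listing the provinces in the same order as the time-series file. A more defensive variant, used in place of that assignment, merges on the province name instead (a sketch; note the daily report uses 'Province_State' while statData uses 'Province/State'):
# Sketch: attach death counts by province name rather than by row position
deaths = chinaData[['Province_State', 'Deaths']].rename(
    columns={'Province_State': 'Province/State', 'Deaths': 'death'})
statData = statData.merge(deaths, on='Province/State', how='left')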
#%%
# Build the spatial weight matrices
import pandas as pd
import numpy as np
from geopy.distance import geodesic
statData = pd.read_csv('D:/WHO/statData.csv')

def DISTMatrix(SW, Dist):
    # Fill the matrix SW passed in by the caller: two provinces whose
    # distance is at most Dist km are treated as adjacent (1); a province
    # is never treated as adjacent to itself, so the diagonal stays 0
    for i in range(0, 34):
        # loop over the 34 provinces
        for t in range(i+1, 34):
            # loop over the remaining provinces
            cityDistance = geodesic(
                (statData.loc[i, 'Lat'], statData.loc[i, 'Long']),
                (statData.loc[t, 'Lat'], statData.loc[t, 'Long'])
            ).km
            # geodesic distance between the two locations, in km
            if cityDistance <= Dist:
                SW[i, t] = 1.0
                SW[t, i] = 1.0
                # mark the pair as adjacent when the distance is within the threshold

def MatrixSelect(dist):
    # For the threshold dist (km), build the spatial weight matrix
    # and write it out as a csv file named after dist/1000
    SWName = 'SW' + str(dist/1000) + '.csv'
    SW = np.mat(np.zeros((34, 34)))
    DISTMatrix(SW, dist)
    SW = pd.DataFrame(SW)
    SW.to_csv('D:/WHO/' + SWName, index=False, header=False)

setDist = 900
while setDist <= 2000:
    MatrixSelect(setDist)
    setDist = setDist + 100
# Build the weight matrices for thresholds from 900 km to 2000 km
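Before these matrices go into the Moran's I computation, it is worth a quick sanity check that each one is symmetric, has a zero diagonal, and that no province is left without any neighbour (isolated rows are why the module further down guards against zero row sums). A minimal sketch for one of the generated files:
# Sketch: basic checks on one generated weight matrix
SW = pd.read_csv('D:/WHO/SW0.9.csv', header=None).values
print((SW == SW.T).all())                  # symmetric
print((np.diag(SW) == 0).all())            # zero diagonal (no self-adjacency)
print(int((SW.sum(axis=1) == 0).sum()))    # provinces with no neighbour within 900 km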
#%%
import pandas as pd
import numpy as np
import Moran
statData = pd.read_csv('statData.csv')
stage1 = np.matrix(statData['stage1'])
stage2 = np.matrix(statData['stage2'])
total = np.matrix(statData['total'])
death = np.matrix(statData['death'])
DataList = [stage1,stage2,total,death]
# Put the four data series into a list; this is the inner loop of the
# index computation: for every weight matrix, each series in the list is evaluated
FileList = ['SWM.csv']
# SWM.csv is an additional weight matrix prepared separately
# (it is not generated by the distance loop above)
i = 0.9
while i < 2.1:
    i = round(i, 2)
    FileName = 'SW' + str(i) + '.csv'
    FileList.append(FileName)
    i = i + 0.1
# Collect the weight-matrix file names into a list;
# this is the outer loop of the index computation
#%%
MoranChart = pd.DataFrame(columns=range(0, 12))
# Data frame to hold Moran's I, the Z value and the p value
# (columns 3t to 3t+2 hold the three values for data series t)
for i in range(0, 13):
    # loop over the 13 spatial weight matrices
    SW = pd.read_csv(FileList[i], header=None)
    # read the file
    SW = np.matrix(SW.values)
    # convert to a matrix
    print()
    print(i)
    # progress output
    for t in range(0, 4):
        # loop over the four data series
        MoranChart.loc[i, (3*t):(3*t)+2] = Moran.Index(DataList[t], SW)
MoranChart.to_csv('D:/WHO/MoranChart.csv', index=False, header=False)
#%%
# For each series, use the weight matrix that gave the smallest p value
# and draw the Moran scatter plot
SW = pd.read_csv('SW1.5.csv',header=None)
SW = np.matrix(SW.values)
Moran.Plot(stage1,SW)
Moran.Plot(total,SW)
Moran.Plot(death,SW)
SW = pd.read_csv('SW0.9.csv',header=None)
SW = np.matrix(SW.values)
Moran.Plot(stage2,SW)
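The choice of SW1.5.csv and SW0.9.csv above came from looking through MoranChart.csv by hand. A minimal sketch of how that selection could be automated, assuming the column layout written above (columns 3t, 3t+1, 3t+2 holding Moran's I, Z and p for data series t) and that FileList is still in memory:
# Sketch: find, for each data series, the weight matrix with the smallest p value
chart = pd.read_csv('D:/WHO/MoranChart.csv', header=None)
for t, name in enumerate(['stage1', 'stage2', 'total', 'death']):
    p = chart[3*t + 2].astype(float)   # p-value column of series t
    best = p.idxmin()                  # row index of the smallest p value
    print(name, FileList[best], p[best])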
Below is the module used to compute Moran's I and the related values. I also adapted it from code found elsewhere and only extracted the parts I needed.
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
def Index(X,W):
    # X is a row matrix, W is the weight matrix (0-1 valued and symmetric)
    # Toy example:
    # X = np.matrix([8,6,6,3,2])
    # W = np.matrix([[0,1,1,0,0],[1,0,1,1,0],[1,1,0,1,0],[0,1,1,0,1],[0,0,0,1,0]])
    # Preprocessing
    X = X.T
    # transpose the input row matrix into a column vector
    X = X/1.0
    # make sure X holds floats
    W = W/1.0
    # make sure W holds floats
    n = len(X)
    # number of observations
    # Row-standardize the weight matrix
    Wsum = W.sum(axis=1)
    # row sums of W
    Locate0 = np.where(Wsum == 0)[0]
    # rows whose sum is 0
    Wsum[Locate0] = 1.0
    # set those sums to 1.0 so that 0 never ends up in the denominator
    W_standard = W/Wsum
    # row-standardized weights
    # Standardize X
    xm = np.mean(X)
    sx = np.std(X)
    z = (X - xm)/sx
    # Spatial lag of the standardized values; both X and W are standardized here
    Wz = W_standard*z
    # The slope of the regression of Wz on z (through the origin) is Moran's I
    z_lstsq = np.linalg.lstsq(z, Wz, rcond=-1)
    # Significance test for Moran's I under the normal approximation
    WW = W_standard
    E_I = -1.0/(n-1)
    S0 = np.sum(WW)
    WW1 = np.multiply(WW+WW.T, WW+WW.T)
    S1 = np.sum(WW1)/2.0
    WW2 = np.sum((WW+WW.T), axis=1)
    S2 = np.sum(np.multiply(WW2, WW2))
    Var_I = (n*n*S1 - n*S2 + 3*S0*S0)/((n*n-1)*S0*S0) - E_I*E_I
    Z_I = (float(z_lstsq[0]) - E_I)/np.sqrt(Var_I)
    # One-sided p value for positive spatial autocorrelation (upper tail)
    Z_p = norm.sf(Z_I)
    print("Moran's I:", float(z_lstsq[0]))
    print('Z value:', Z_I)
    print('p value:', Z_p)
    return float(z_lstsq[0]), Z_I, Z_p
    # returns Moran's I, the Z statistic and the p value, in that order
def Plot(X,W):
    # X is a row matrix, W is the weight matrix (0-1 valued and symmetric)
    # Preprocessing
    X = X.T
    # transpose the input row matrix into a column vector
    X = X/1.0
    # make sure X holds floats
    W = W/1.0
    # make sure W holds floats
    # Row-standardize the weight matrix
    Wsum = W.sum(axis=1)
    # row sums of W
    Locate0 = np.where(Wsum == 0)[0]
    # rows whose sum is 0
    Wsum[Locate0] = 1.0
    # set those sums to 1.0 so that 0 never ends up in the denominator
    W_standard = W/Wsum
    # row-standardized weights
    # Standardize X
    xm = np.mean(X)
    sx = np.std(X)
    z = (X - xm)/sx
    # Spatial lag of the standardized values
    Wz = W_standard*z
    # The slope of the regression of Wz on z is Moran's I
    z_lstsq = np.linalg.lstsq(z, Wz, rcond=-1)
    # Fitted values; the fitted line passes through the origin
    z_z = z*z_lstsq[0]
    # Draw the Moran scatter plot
    plt.tick_params(labelsize=30)
    plt.plot(z, Wz, 'ro')
    plt.plot(z, z_z)
    plt.grid()
    plt.xlabel('x axis', fontsize=30)
    plt.ylabel('Wx axis', fontsize=30)
    plt.title("Moran's I Scatter Plot", fontsize=30)
    plt.show()
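As a quick way to check that the module behaves sensibly, the five-point toy example from the comments in Index can be run directly (a minimal sketch; it assumes the module above is saved as Moran.py):
# Toy example taken from the comments in Moran.Index
import numpy as np
import Moran

X = np.matrix([8, 6, 6, 3, 2])
W = np.matrix([[0, 1, 1, 0, 0],
               [1, 0, 1, 1, 0],
               [1, 1, 0, 1, 0],
               [0, 1, 1, 0, 1],
               [0, 0, 0, 1, 0]])
I, Z, p = Moran.Index(X, W)   # prints Moran's I, the Z value and the p value
Moran.Plot(X, W)              # draws the corresponding scatter plot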