Data preprocessing
import pandas as pd

# Read the pollutant concentration data (Annex 1)
df = pd.read_excel('附件1:污染物浓度数据.xlsx')
print(df)
# Build a date column from the year/month/day columns
df['date'] = df['年'].map(str) + "/" + df['月'].map(str) + "/" + df['日'].map(str)
df['date'] = pd.to_datetime(df['date'])
# Print to check the result
print(df['date'])

# Read the meteorological data (Annex 2)
data2 = pd.read_excel('附件2:气象数据.xlsx')
data2['date'] = data2['V04001'].map(str) + "/" + data2['V04002'].map(str) + "/" + data2['V04003'].map(str)
data2['date'] = pd.to_datetime(data2['date'])

# Write the result to a separate CSV file
df = df.drop(labels='年', axis=1)
df = df.drop(labels='月', axis=1)
df = df.drop(labels='日', axis=1)
filename = '附件一污染物浓度.csv'
df.to_csv(filename, encoding='gb2312')

# Write the result to a separate CSV file
data2 = data2.drop(labels='V04001', axis=1)
data2 = data2.drop(labels='V04002', axis=1)
data2 = data2.drop(labels='V04003', axis=1)
filename = '附件二气象数据.csv'
data2.to_csv(filename, encoding='gb2312')
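The XGBoost section further below reads a combined table, 总变量(无AQI 等级).csv, but the merge that produces it is not shown in the source. A minimal sketch, assuming the two frames are simply joined on the date column built above, would be:

# Assumption: combine the pollutant and meteorological tables on the shared date column
merged = pd.merge(df, data2, on='date', how='inner')
merged.to_csv('总变量(无AQI 等级).csv', encoding='gb2312', index=False)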
import numpy as np
import pandas as pd
from scipy import interpolate
import matplotlib.pyplot as plt
import time, datetime
from datetime import datetime, date, timedelta
import math

# Interpolation
def interpolation_value(data_gp):
    """
    Interpolate a time series: missing dates are filled with cubic-spline
    interpolation; if there are still too few points, extra points are
    back-filled before the series using running means.
    :param data_gp: DataFrame of one group after grouping by id
    :return: df
    """
    x = data_gp['date_time'].values.tolist()
    y = data_gp['ecpm_tomorrow'].values.tolist()
    # print(x)
    # print(y)
    # plt.scatter(x, y)
    # plt.plot(x, y)
    # plt.show()
    # Collect the dates that need interpolation
    lxs = len(x)
    insert_x = []   # dates to interpolate
    mean_x = []     # dates to back-fill with means
    isx = 0
    flag = 10
    if lxs < 10:
        # Check whether the dates are consecutive and insert any missing dates
        for i in range(len(x)):
            if i + 1 == len(x):
                break
            t1 = int(time.mktime(time.strptime(x[i], "%Y-%m-%d")))
            t2 = int(time.mktime(time.strptime(x[i + 1], "%Y-%m-%d")))
            differ = (datetime.fromtimestamp(t2) - datetime.fromtimestamp(t1)).days
            # print("gap of", differ, "days")
            while differ != 1:
                differ -= 1
                tmp = (datetime.fromtimestamp(t2) +
                       timedelta(days=-differ)).strftime("%Y-%m-%d")
                insert_x.append(tmp)
        isx = len(insert_x)
        tos = isx + lxs
        # If there are still too few points, back-fill with means: the first added
        # point is the mean of the first 2 existing values, the second the mean of
        # the first 3, and so on
        if tos < math.floor(lxs / 2 + 1) + lxs:
            flag = math.floor(lxs / 2 + 1)
            diffs = flag
            timx0 = int(time.mktime(time.strptime(x[0], "%Y-%m-%d")))
            while diffs != 0:
                tmp = (datetime.fromtimestamp(timx0) +
                       timedelta(days=-diffs)).strftime("%Y-%m-%d")
                mean_x.append(tmp)
                diffs -= 1
    # Map the dates to numbers (keeping the correspondence) so they can be interpolated
    newxlist = x + insert_x + mean_x
    newxlist = sorted(newxlist)
    # print(newxlist)
    xdict = {}       # date -> numeric position after insertion
    resx_dict = {}   # interpolation results; key: date, value: ecpm_tomorrow
    x_list = []      # numeric positions of the original dates
    x_i_list = []    # numeric positions of the dates to interpolate
    x_m_list = []    # numeric positions of the dates to back-fill with means
    j = 0
    for i in range(len(newxlist)):
        xdict[newxlist[i]] = i + 1
        if newxlist[i] in x:
            x_list.append(xdict[newxlist[i]])
            resx_dict[newxlist[i]] = y[j]
            j += 1
        elif newxlist[i] in insert_x:
            x_i_list.append(xdict[newxlist[i]])
        elif newxlist[i] in mean_x:
            x_m_list.append(xdict[newxlist[i]])
    # print(xdict)
    # print(x_list)
    # print(x_i_list)
    # print(x_m_list)
    # print(resx_dict)
    # Build the interpolation function; linear: linear interpolation, cubic: cubic spline
    # Flinear = interpolate.interp1d(x_list, y, kind='linear')
    Flinear = interpolate.interp1d(x_list, y, kind='cubic')
    # Cubic-spline interpolation for the inserted dates
    if len(x_i_list) != 0:
        ynew = Flinear(x_i_list)
        ynew = np.array(ynew).tolist()
        ynew = [abs(round(xi, 4)) for xi in ynew]
        j = 0
        for i in x_i_list:
            k = [k for k, v in xdict.items() if v == i][0]
            resx_dict[k] = ynew[j]
            j += 1
    # Back-fill with running means
    if len(x_m_list) != 0:
        for i in x_m_list:
            k = [k for k, v in xdict.items() if v == i][0]
            tmp = xdict[k] + 1
            value = round(sum(y[:tmp]) / tmp, 4)
            resx_dict[k] = value
    resx_dict = sorted(resx_dict.items(), key=lambda x: x[0], reverse=False)
    resx_dict = dict(resx_dict)
    # print(resx_dict)
    resx_list, resy_list = [], []
    for k, v in resx_dict.items():
        resx_list.append(k)
        resy_list.append(v)
    # plt.scatter(resx_list, resy_list)
    # plt.plot(resx_list, resy_list)
    # plt.show()
    df = {
        'date_time': resx_list,
        'ecpm_tomorrow': resy_list,
    }
    data = pd.DataFrame(df)
    return data
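A minimal usage sketch of interpolation_value; the dates and values below are made up purely for illustration, and the input is assumed to already be one id group with 'date_time' strings in %Y-%m-%d format and an 'ecpm_tomorrow' column:

sample = pd.DataFrame({
    'date_time': ['2021-01-01', '2021-01-02', '2021-01-05', '2021-01-06'],
    'ecpm_tomorrow': [35.0, 40.0, 55.0, 50.0],
})
# 2021-01-03/04 are filled by the cubic spline; three earlier dates are back-filled with running means
print(interpolation_value(sample))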
import pandas as pd
from fancyimpute import IterativeImputer

# Read the Excel file
df = pd.read_excel('附件一污染物浓度(无等级).xlsx')
# Sort by date
df = df.sort_values(by='date')
# Multivariate (iterative) imputation of missing values
imputer = IterativeImputer()
imputed_df = pd.DataFrame(imputer.fit_transform(df.iloc[:, 0:7]))
# Merge the imputed values back into the original DataFrame
df.iloc[:, 0:7] = imputed_df.values
# Save the imputed data
df.to_excel('附件一污染物浓度(无等级)(插补).xlsx', encoding='gb2312', index=False)
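fancyimpute is a third-party package; if it is not available, scikit-learn ships an equivalent (still experimental) IterativeImputer that can be swapped in:

# Drop-in alternative to fancyimpute's IterativeImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401  (activates the experimental API)
from sklearn.impute import IterativeImputer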
import pandas as pd
import matplotlib.pyplot as plt

# Read the meteorological data and inspect it for outliers
data = pd.read_csv('附件二气象数据.csv')
print(data)
# print(data.describe())
outputfile = '描述.xls'  # output path for the descriptive statistics
data.describe().to_excel(outputfile, index=True)

plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False     # display minus signs correctly
plt.subplot(1,2,1)
data[["V11291_700"].boxplot(return_type='dict')
plt.subplot(1,2,2)
data[["V12001_700"]].boxplot(return_type='dict')
plt.show()
plt.subplot(1,2,1)
data[["V13305"]].boxplot(return_type='dict')
plt.subplot(1,2,2)
data[["V10004_700"].boxplot(return_type='dict')
plt.show()
plt.subplot(1,2,1)
data[["V13003_700"]].boxplot(return_type='dict')
plt.show()
import numpy as np

# Outlier handling: for each meteorological variable, first apply the 1.5*IQR rule,
# then a mean +/- 4*std rule; flagged values are replaced with the column median
for col in ["V13305", "V10004_700", "V11291_700", "V12001_700", "V13003_700"]:
    # 1.5*IQR rule
    a = data[col].quantile(0.75)
    b = data[col].quantile(0.25)
    c = data[col]
    c[(c >= (a - b) * 1.5 + a) | (c <= b - (a - b) * 1.5)] = np.nan
    c.fillna(c.median(), inplace=True)
    data[[col]] = c
    # data.dropna(how='any', axis=0, inplace=True)
    # mean +/- 4*std rule
    a = data[col].mean() + data[col].std() * 4
    b = data[col].mean() - data[col].std() * 4
    c = data[col]
    c[(c >= a) | (c <= b)] = np.nan
    c.fillna(c.median(), inplace=True)
    data[[col]] = c
    # data.dropna(how='any', axis=0, inplace=True)
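Before overwriting anything, it can be useful to count how many values the IQR rule would flag per column; a small check (same column list as above), to be run ahead of the cleaning loop:

for col in ["V13305", "V10004_700", "V11291_700", "V12001_700", "V13003_700"]:
    q3, q1 = data[col].quantile(0.75), data[col].quantile(0.25)
    iqr = q3 - q1
    flagged = ((data[col] >= q3 + 1.5 * iqr) | (data[col] <= q1 - 1.5 * iqr)).sum()
    print(col, "values flagged by the IQR rule:", flagged)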
# XGBoost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

df = pd.read_csv('总变量(无AQI 等级).csv', encoding='gb2312')
df.drop(['date'], axis=1, inplace=True)
columns = df.columns.tolist()

# Create a StandardScaler() instance
standard_sl = StandardScaler()
# Standardize each column of the DataFrame
standard_sl_data = standard_sl.fit_transform(df)
# Convert the standardized data back into a DataFrame
df = pd.DataFrame(standard_sl_data, columns=columns)
print(df.head())
# Extract the feature variables and the target variable separately:
X = df.drop(columns='PM2.5')
y = df['PM2.5']
# Split into training and test sets, then fit an XGBRegressor() model from the xgboost package:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
# split parameters follow the main script at the end of this appendix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = XGBRegressor()  # default parameters
model.fit(X_train, y_train)
# Predict the test set:
y_pred = model.predict(X_test)
# print(y_pred[0:134])
model.score(X_test, y_test)
features = X.columns                      # feature names
importances = model.feature_importances_  # feature importances
# Display them as a two-column table
importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)
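To make the ranking easier to read than a raw table, the same values can be drawn as a horizontal bar chart; this uses only matplotlib and the objects created above:

sorted_imp = importances_df.sort_values('特征重要性', ascending=True)
plt.barh(sorted_imp['特征名称'], sorted_imp['特征重要性'])
plt.xlabel('特征重要性')
plt.tight_layout()
plt.show()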
# The XGBoost regression model can then be tuned with a grid search:
from sklearn.model_selection import GridSearchCV
parameters = {'max_depth': [1, 3, 5, 7, 9],
              'n_estimators': [20, 50, 100, 150, 200],
              'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5]}  # parameter grid
clf = XGBRegressor()  # regression model for the search
grid_search = GridSearchCV(clf, parameters, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)  # run the search
grid_search.best_params_  # best parameter values

# Set the selected parameters on the model:
model = XGBRegressor(max_depth=3, n_estimators=150, learning_rate=0.2)
model.fit(X_train, y_train)
# Evaluate with r2_score() (model.score(X_test, y_test) gives the same result):
from sklearn.metrics import r2_score
r2 = r2_score(y_test, model.predict(X_test))
print(r2)
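As noted above, model.score() uses R² for regressors, so the following prints the same value as r2:

print(model.score(X_test, y_test))  # identical to r2_score(y_test, model.predict(X_test))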
features = X.columns                      # feature names
importances = model.feature_importances_  # feature importances
# Display them as a two-column table
importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)
# SHAP
import shap
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
shap_values_obj = explainer(X_test)
plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False     # display minus signs correctly
features = X[:134]  # analyze only the first 134 samples
shap_values = explainer.shap_values(features)
shap.decision_plot(explainer.expected_value, shap_values,
                   features, feature_order='hclust')
shap.plots.heatmap(shap_values_obj)
clustering = shap.utils.hclust(X, y)
shap.plots.bar(shap_values_obj,
               clustering=clustering,
               clustering_cutoff=0.5)
shap_interaction_values = explainer.shap_interaction_values(X_test)
shap.summary_plot(shap_interaction_values, X_test)
shap.summary_plot(shap_interaction_values, X_test, max_display=10, plot_type="compact_dot")
Computing the permutation entropy
import numpy as np
from math import factorial

def permutation_entropy(time_series, order=7, delay=1, normalize=False):
    x = np.array(time_series)                     # e.g. x = [4, 7, 9, 10, 6, 11, 3]
    hashmult = np.power(order, np.arange(order))  # positional weights, e.g. [1, 3, 9] for order=3
    # _embed builds the time-delay embedding matrix;
    # argsort returns, for every row, the indices sorted by value.
    # For the row [9, 10, 6]: 9 has index 0, ..., 6 is the smallest,
    # so the sorted index vector starts with 6's index 2, giving [2, 0, 1].
    sorted_idx = _embed(x, order=order, delay=delay).argsort(kind='quicksort')
    # np.multiply is element-wise; weighting by hashmult and summing each row makes
    # rows with the same indices in a different order hash to different values
    hashval = (np.multiply(sorted_idx, hashmult)).sum(1)  # e.g. [21, 21, 11, 19, 11]
    # Count how often each permutation pattern occurs
    _, c = np.unique(hashval, return_counts=True)  # e.g. c = [2, 1, 2]: 11 appears twice, 19 once
    p = np.true_divide(c, c.sum())                 # relative frequencies, e.g. [0.4, 0.2, 0.4]
    pe = -np.multiply(p, np.log2(p)).sum()         # entropy of the pattern distribution
    if normalize:                                  # optional normalization
        pe /= np.log2(factorial(order))
    return pe

# Build the time-delay embedding matrix from a 1-D time series
def _embed(x, order=7, delay=1):
    """Time-delay embedding.

    Parameters
    ----------
    x : 1d-array, shape (n_times)
        Time series.
    order : int
        Embedding dimension (order).
    delay : int
        Delay.

    Returns
    -------
    embedded : ndarray, shape (n_times - (order - 1) * delay, order)
        Embedded time series.
    """
    N = len(x)
    Y = np.empty((order, N - (order - 1) * delay))
    for i in range(order):
        Y[i] = x[i * delay:i * delay + Y.shape[1]]
    return Y.T
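A quick sanity check using the example series from the comments above; with order=3 the pattern frequencies are [0.4, 0.2, 0.4], so the entropy is about 1.52 bits:

print(permutation_entropy([4, 7, 9, 10, 6, 11, 3], order=3))  # ≈ 1.5219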
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

if __name__ == '__main__':
    # x = pd.read_csv('总变量(无AQI 等级).csv', encoding='gb2312')
    # x.drop(['date'], axis=1, inplace=True)
    # Compute the permutation entropy of every feature column
    fe_list = []
    for i in df.columns:
        s1 = df[i]
        fe = permutation_entropy(s1, 2)
        fe_list.append(fe)
    features = df.columns
    features_df = pd.DataFrame()
    features_df['特征名称'] = features
    features_df['特征重要性'] = fe_list
    print(features_df)
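The remaining listing reorganizes the same pipeline into functions. The source does not show the import block or the first half of interpolate_time_series (the part that builds new_date_list and value_list), so the sketch below reconstructs them under the assumption that the input DataFrame has a 'date' column of %Y-%m-%d strings and an 'ecpm_tomorrow' value column, as in the earlier code:

import datetime
import pandas as pd
import xgboost as xgb
import shap
from scipy import interpolate
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

def interpolate_time_series(df):
    """
    Fill gaps in a daily time series by interpolation.
    :param df: DataFrame with a 'date' column ('%Y-%m-%d' strings) and an
               'ecpm_tomorrow' value column (assumed; this part is missing in the source)
    :return: DataFrame with a continuous daily date range and interpolated values
    """
    df = df.sort_values(by='date')
    # Dates actually observed and their values (assumed reconstruction)
    known_dates = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d').tolist()
    value_list = df['ecpm_tomorrow'].tolist()
    # Continuous daily range covering the observation period
    full_range = pd.date_range(known_dates[0], known_dates[-1], freq='D')
    new_date_list = [d.strftime('%Y-%m-%d') for d in full_range]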
    # Convert the dates to numbers (timestamps) so they can be interpolated
    known_x_numbers = [datetime.datetime.strptime(d, '%Y-%m-%d').timestamp() for d in known_dates]
    x_numbers = [datetime.datetime.strptime(date_str, '%Y-%m-%d').timestamp() for date_str in new_date_list]
    y_numbers = value_list
    # Interpolate (linear shown here; switch to cubic splines etc. as needed)
    linear_interp = interpolate.interp1d(known_x_numbers, y_numbers, kind='linear')
    new_y_numbers = linear_interp(x_numbers)
    # Assemble the interpolated data
    result_dict = {}
    for date_str, value in zip(new_date_list, new_y_numbers):
        result_dict[date_str] = value
    result_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=['ecpm_tomorrow'])
    result_df.reset_index(inplace=True)
    result_df.rename(columns={'index': 'date'}, inplace=True)
    return result_df
def standardize_data(df):
    """
    Standardize the data.
    :param df: original DataFrame
    :return: standardized DataFrame
    """
    scaler = StandardScaler()
    standardized_data = scaler.fit_transform(df)
    return pd.DataFrame(standardized_data, columns=df.columns)
def train_xgb_model(X_train, y_train, X_test, y_test):
    """
    Train an XGBoost regression model and evaluate it.
    :param X_train: training features
    :param y_train: training target
    :param X_test: test features
    :param y_test: test target
    :return: trained model, R2 score on the test set
    """
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return model, score
def tune_xgb_model(X_train, y_train, X_test, y_test):
    """
    Tune the XGBoost model hyperparameters with a grid search.
    :param X_train: training features
    :param y_train: training target
    :param X_test: test features
    :param y_test: test target
    :return: tuned model, best parameters, R2 score on the test set
    """
    parameters = {
        'max_depth': [3, 5, 7, 9],
        'n_estimators': [50, 100, 150, 200],
        'learning_rate': [0.01, 0.05, 0.1, 0.2]
    }
    model = xgb.XGBRegressor()
    grid_search = GridSearchCV(model, parameters, scoring='r2', cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
    y_pred = best_model.predict(X_test)
    score = r2_score(y_test, y_pred)
    return best_model, best_params, score
def analyze_shap(model, X_test):
    """
    Explain the model with SHAP.
    :param model: trained model
    :param X_test: test features
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)
    # Decision plot (how the features push each prediction)
    shap.decision_plot(explainer.expected_value, shap_values, feature_names=list(X_test.columns))
    # Feature-importance heatmap (needs an Explanation object)
    shap_values_obj = explainer(X_test)
    shap.plots.heatmap(shap_values_obj)
    # Feature interaction summary
    shap_interaction_values = explainer.shap_interaction_values(X_test)
    shap.summary_plot(shap_interaction_values, X_test)
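read_pollutant_data() and read_meteorological_data() are called in the main block below but never defined in the source; a minimal sketch, assuming they simply load the CSV files produced by the preprocessing step at the top of this appendix:

def read_pollutant_data():
    # Assumption: the cleaned pollutant CSV written in the preprocessing step
    return pd.read_csv('附件一污染物浓度.csv', encoding='gb2312')

def read_meteorological_data():
    # Assumption: the cleaned meteorological CSV written in the preprocessing step
    return pd.read_csv('附件二气象数据.csv', encoding='gb2312')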
if __name__ == "__main__":
    # Read the data
    pollutant_df = read_pollutant_data()
    meteorological_df = read_meteorological_data()
    # Data processing (interpolation etc.; call as needed)
    # Here the pollutant data is interpolated as an example
    interpolated_pollutant_df = interpolate_time_series(pollutant_df)
    # Standardize the data
    standardized_pollutant_df = standardize_data(interpolated_pollutant_df)
    # Split into training and test sets
    X = standardized_pollutant_df.drop('PM2.5', axis=1)
    y = standardized_pollutant_df['PM2.5']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Train and evaluate the model
    model, score = train_xgb_model(X_train, y_train, X_test, y_test)
    print(f"Initial model R2 score: {score}")
    # Hyperparameter tuning
    tuned_model, best_params, tuned_score = tune_xgb_model(X_train, y_train, X_test, y_test)
    print(f"Tuned model R2 score: {tuned_score}, best parameters: {best_params}")
    # SHAP analysis
    analyze_shap(tuned_model, X_test)