# -*- coding: utf-8 -*-
"""
Created on Fri Dec 2 13:46:48 2022
@author: Lenovo
"""
from sklearn.metrics import make_scorer
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score,mean_squared_error
# from sklearn.preprocessing import StandardScaler
from scipy.stats import gaussian_kde
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn.feature_selection import RFECV
from scipy.interpolate import griddata
from itertools import combinations
from operator import itemgetter
dic = {}
path=r'D:\Fluxnet\try'
outpath=r'D:\Fluxnet\OUTCOME\每种变量组合放在一起之前的仓库'  # folder name: "archive before merging each driver combination"
site_list=[]
year_list=[]
total_number=[]
post_dropna_number=[]
post_drop_le_abnormal_number=[]
test_number=[]
train_number=[]
N_estimators=[]
Max_depth=[]
Rmse_list=[]
R2_list=[]
Bias_list=[]
Drivers_column=[]
Filling_rate_list=[]
Feature_list=[]
# path1=r'D:\Fluxnet\try'
# path2=r'D:\Fluxnet\try_ndvi'
# path1=r'D:\Fluxnet\加了土壤水和土壤温度的\MDS_用'
# path2=r'D:\Fluxnet\ndvi777 - SHAOSHAOSHAO'
# for s,j in zip(os.listdir(path1),os.listdir(path2)):
# print(s)
# print(os.listdir(path2))
# sole_s=pd.read_csv(os.path.join(path1,s))
# sole_j=pd.read_csv(os.path.join(path2,j))
# sole_s['TIMESTAMP_START']=sole_s['TIMESTAMP_START'].astype('str')
# sole_s['TIMESTAMP_START']=pd.to_datetime(sole_s['TIMESTAMP_START'])
# sole_j=sole_j[['TIMESTAMP_START','NDVI']]
# sole_j['TIMESTAMP_START'] = pd.to_datetime(sole_j['TIMESTAMP_START'])
# sole_j = sole_j.set_index('TIMESTAMP_START')
# sole_j = sole_j.resample('1D').interpolate()  # '30T' interpolates at 30-minute steps, '1D' at daily steps
# sole_j = sole_j.reset_index()
# sole=pd.merge(sole_s, sole_j,how='left',on='TIMESTAMP_START')
# sole['NDVI']=sole['NDVI'].interpolate(method='pad')  # one value per day, forward-filled
# print(sole)
path1 = r'C:\Users\Lenovo\Desktop\四大类\REALTRY'
for file in os.listdir(path1):
sole = pd.read_csv(os.path.join(path1,file))
site_list1=[]
year_list1=[]
test_number1=[]
train_number1=[]
rmse_list1=[]
r2_list1=[]
bias_list1=[]
sole_raw = sole.copy()   # .copy(): plain assignment would only alias `sole`
sole_copy = sole.copy()  # and be mutated by the inplace dropna below
print('Raw data:',sole.shape)
sole.dropna(subset=['LE_F_MDS_QC'],axis=0,inplace=True)  # drop rows with NaN in LE_F_MDS_QC
print('Raw data after dropping rows without QC:',sole.shape)
trainset=sole[sole['LE_F_MDS_QC']==0]
print('Observed data:',trainset.shape)
# =============================================================================
# Split at SW_IN = 20 W/m²: train daytime and nighttime separately
# =============================================================================
trainset=trainset[trainset['SW_IN']>=20]
print('Daytime total: ',trainset.shape)
gap=sole[sole['LE_F_MDS_QC']!=0]
print('Data to gap-fill:',gap.shape)
gap_drople=gap.drop(['LE_F_MDS','LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']#
# , 'SW_IN_F_MDS_QC', 'NETRAD'
,axis=1)
# gap_drople=gap_drop.drop(['SW_IN_F_MDS_QC', 'NETRAD'],axis=1)
#=============================== keep rows with at least one/three non-NaN values
gap_dropna=gap_drople
# gap_dropna=gap_drople.dropna(axis=0,thresh=3)
print('Gap-fill set after dropping NaNs:',gap_dropna.shape)
dff=pd.DataFrame(gap_dropna.isna().sum().sort_values(ascending=False))
print('NaN counts in the prediction set:',dff)
# inspect NaNs in the training set; the pattern differs from the gap-fill set
print('NaN counts in the training set:\n',trainset.drop(['LE_F_MDS','LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']
,axis=1).isna().sum().sort_values(ascending=False))
#========================== build every combination of the candidate drivers
def combine(list0, o):
    """Return all o-element combinations of list0 as a list of tuples."""
    return list(combinations(list0, o))
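# A minimal usage sketch of combine() (driver names illustrative, not from
# the data):
# combine(['TA', 'SW_IN', 'VPD'], 2)
#   -> [('TA', 'SW_IN'), ('TA', 'VPD'), ('SW_IN', 'VPD')]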
tianchongliang=[]   # filling rate of the chosen combination
chabuliang=[]       # number of rows to gap-fill
rmseliang=[]        # test-set size used for the RMSE
site_list=[]
ALL_rmse_list=[]
rmse_number=[]
pinjie_number=[]    # newly appended test points per combination
train_number=[]
rmse_list=[]
rmse1_list=[]
all_rmse1_list=[]
r2_list=[]
r21_list=[]
ALL_r2_list=[]
all_r21_list=[]
bias_list=[]
bias1_list=[]
bianliang=[]        # number of drivers in the combination
ALL_bias_list=[]
all_bias1_list=[]
filling_rate_list=[]
dic_list = []
N_estimators = []
Max_depth =[]
fig = plt.figure(figsize=(4,40),dpi=600)
fig1 = plt.figure(figsize=(16,36),dpi=600)
ALL_x_test = pd.Series(dtype=float)  # explicit dtype avoids the empty-Series warning
ALL_y_test = pd.Series(dtype=float)
qian=0
hou=-1
countt=0
for u in reversed(range(3,len(gap_drople.columns)+1)) :
fillrate_mid_list=[]
col_list=[]
list666=[]
list666.extend(combine(dff.index,u))
#=========================== filling rate of every driver combination of size u
list_score=[]
score=[]
big_list=[]
for i in range(0,len(list666)):
    sco=f'{gap_drople[list(list666[i])].dropna().shape[0] / gap_drople.shape[0]:.2f}'
    score+=[sco]
    list_score+=[{'score':sco,'list':list666[i]}]
# print(list_score)
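# The score above is the "filling rate": the share of gap rows in which every
# driver of the combination is observed. A standalone sketch (hypothetical
# helper, not called by the pipeline):
def filling_rate(frame, drivers):
    """Fraction of rows of `frame` with no NaN in any column of `drivers`."""
    return frame[list(drivers)].dropna().shape[0] / frame.shape[0]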
#=============================plot
key_list=[a['list'] for a in list_score]
len_list = [ len(i) for i in key_list ]
score=[np.float64(i) for i in score]
plt.rc('font', family='Times New Roman',size=20)
plt.figure(figsize=(10,8),dpi=400)
plt.scatter(len_list,score)
plt.xlabel('Number of drivers', {'family':'Times New Roman','weight':'normal','size':20})
plt.ylabel('Filling rate',{'family':'Times New Roman','weight':'normal','size':20})
#============================ driver combination with the largest filling rate
sorted_list=sorted(list_score, key=lambda d: d['score'], reverse=True)
# print(sorted_list)  # descending; comparing the score strings is safe because they share the fixed '%.2f' format
biggest_score=sorted_list[0]['score']
biggest_score_feature_list=sorted_list[0]['list']
# print(biggest_score_feature_list)
Feature_list.append(biggest_score_feature_list)
filling_rate_list.append(biggest_score)
Filling_rate_list.append(biggest_score)
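# For each combination size u, only the combination with the highest filling
# rate is kept; u runs from the full driver set downwards, so later (smaller)
# combinations are only used to fill what earlier models could not reach.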
#============================== model setup ================================
train_copy=trainset.copy()
train_copy.drop(['LE_F_MDS_QC','TIMESTAMP_START','TIMESTAMP_END']
,axis=1,inplace=True)
feature=list(biggest_score_feature_list)
train_option=train_copy[feature]
train_option['LE_F_MDS']=train_copy['LE_F_MDS']
print("Train_option raw shape\n",train_option.shape)
print(train_option.isna().sum().sort_values(ascending=True))
#============================ drop NaNs =======================================
train_option_dropna=train_option.dropna()  # drop NaNs from the training data
print('Training set after dropping NaNs: ',train_option_dropna.shape)
c=train_option_dropna
print(c.shape)
Drivers=c.drop(['LE_F_MDS'],axis=1)
Drivers_column+=[' '.join(Drivers.columns.tolist())]
LE=c['LE_F_MDS']
x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
,test_size=0.20
,random_state=(0))
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)
# =============================================================================
# Modeling
# =============================================================================
# def simpleGridSearch(x_train, x_test, y_train, y_test):
#     """Pick n_estimators / max_depth by out-of-bag RMSE (lower is better)."""
#     best_score = float('inf')
#     n_estimators, max_depth = None, None
#     rmse = []
#     for n_esti in [1100,1800,2500]:
#         for max_dep in [110]:
#             rf = RandomForestRegressor(n_estimators=n_esti
#                                        ,max_depth=max_dep
#                                        ,oob_score=True
#                                        ,random_state=0)  # one fit per parameter combination
#             rf.fit(x_train,y_train)
#             score = np.sqrt(mean_squared_error(y_train, rf.oob_prediction_))
#             print(score)
#             rmse+=[score]
#             if score < best_score:  # keep the best-performing parameters
#                 n_estimators = n_esti
#                 max_depth = max_dep
#                 best_score = score
#     print("Best score:{:.2f}".format(best_score))
#     return n_estimators,max_depth
rf=RandomForestRegressor(n_estimators=1100
,max_depth=800
,oob_score=True
,random_state=0)
rf.fit(x_train,y_train)
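# With oob_score=True the fitted forest exposes rf.oob_prediction_, so an
# out-of-bag error estimate is available without touching the held-out 20%
# (mirrors the commented-out search above):
# oob_rmse = np.sqrt(mean_squared_error(y_train, rf.oob_prediction_))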
# rf.fit(Drivers,LE)
# pred_oob = rf.oob_prediction_  # out-of-bag predictions
# print(len(pred_oob))
# print(pred_oob)
# rmse=np.sqrt(mean_squared_error(LE, pred_oob))  # out-of-bag RMSE
site_list+=[file.split('_',6)[1]]
tianchongliang+=[biggest_score]
chabuliang+=[gap.shape[0]]
rmseliang+=[len(y_test)]
rmse=np.sqrt(mean_squared_error(y_test,rf.predict(x_test)))
rmse_list.append(rmse)
r2=r2_score(y_test,rf.predict(x_test))
r2_list.append(r2)
bias=(rf.predict(x_test)-y_test).mean() # bias=(pred_oob-LE).mean()
bias_list.append(bias)
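# Metric definitions used on the held-out 20%:
# RMSE = sqrt(mean((y_hat - y)^2)), bias = mean(y_hat - y),
# R2 = 1 - SS_res/SS_tot (sklearn's r2_score).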
x_test6 = pd.Series(rf.predict(x_test),index=y_test.index)
# =============================================================================
# Linear-interpolation baseline for this single driver combination
# =============================================================================
s_ori = pd.read_csv(os.path.join(path1,file))
s_ori.loc[:,'LE'] = y_test
s_ori.loc[y_test.index,'LE_F_MDS'] = np.nan
s_ori['LE_F_MDS']= s_ori['LE_F_MDS'].interpolate()
rmse1=np.sqrt(mean_squared_error(y_test,s_ori.loc[y_test.index,'LE_F_MDS'] ))
rmse1_list.append(rmse1)
r21=r2_score(y_test,s_ori.loc[y_test.index,'LE_F_MDS'])
r21_list.append(r21)
bias1=(s_ori.loc[y_test.index,'LE_F_MDS']-y_test).mean()
bias1_list.append(bias1)
bianliang.append(u)
# N_estimators.append(n_estimators)
# Max_depth.append(max_depth)
# Chinese column keys are kept for downstream compatibility:
# 'rmse量' = test-set size, '插补量' = gap count, '插补率' = filling rate,
# '变量个数' = number of drivers
rmse_df=pd.DataFrame({'site':site_list,'RF_RMSE':rmse_list
,'IP_RMSE':rmse1_list
,'rmse量':rmseliang,'插补量':chabuliang
,'插补率':tianchongliang
,'RF_R2':r2_list,'IP_R2':r21_list
,'RF_BIAS':bias_list,'IP_BIAS':bias1_list
,'变量个数':bianliang
,'index':bianliang
})#,'n_estimators':n_estimators ,'max_depth':max_depth
print(rmse_df)
rmse_df.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE', str(file.split('_',6)[1]) +'.csv'),index = False)
# =============================================================================
# DYNAMIC RMSE
# =============================================================================
#============================== re-read the full file, fill, save, compare; builds ALL_y_test
dy = pd.read_csv(os.path.join(path1,file))
dy = dy[feature]
dy = dy.dropna()
dy.loc[:, 'dynamic'] = y_test
dy.loc[:, 'dynamicc'] = x_test6
lee=sole.copy()
# all QC codes (-9999/0/1/2/3) become NaN, then the NaNs are filled with the
# test values, so 'LE_F_MDS_QC' temporarily holds the observed test-point LE
lee['LE_F_MDS_QC'].replace([-9999, 0, 1, 2, 3], np.nan, inplace=True)
lee['LE_F_MDS_QC'].fillna(dy['dynamic'], inplace=True)
lee['RMSE']=[rmse]*sole.shape[0]
lee['x_test666'] = np.nan
lee['x_test666'].fillna(dy['dynamicc'], inplace=True)
dic0={'TIMESTAMP_START':lee['TIMESTAMP_START'].tolist()
,'TIMESTAMP_END':lee['TIMESTAMP_END'].tolist()
,'dynamic'+ str(u): lee['LE_F_MDS_QC'].tolist()
,'dynamicc'+ str(u): lee['x_test666'].tolist()
,'RMSE'+ str(u): lee['RMSE']
,'Drivers'+ str(u): [' '.join(Drivers.columns.tolist())]*sole.shape[0]
}
df0 = pd.DataFrame(dic0)
dic={'list_name':df0, 'rmse':df0['RMSE'+ str(u)][df0.index[0]]}
dic_list += [dic]
sorted_dic=sorted(dic_list, key=lambda d: d['rmse'], reverse=False)
list_name=[a['list_name'] for a in sorted_dic]  # each entry is a full DataFrame
df = pd.concat(list_name,axis=1)
df = df.loc[:,~df.columns.duplicated()]
print(df)
shunxu = [''.join(list(filter(str.isdigit,x))) for x in df.columns]
shunxu0 = list(filter(None,shunxu))
shunxu = list(set(shunxu0))  # set() loses order, so re-sort by original position
shunxu.sort(key = shunxu0.index)
print(shunxu)
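# Sketch of the suffix trick above ('shunxu' = order; values illustrative):
# ''.join(filter(str.isdigit, 'dynamicc12')) -> '12'; suffix-free columns such
# as 'TIMESTAMP_START' yield '' and are dropped by filter(None, ...).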
# =========================== Gaussian kernel-density scatter plot ===========================
# post_gs=pd.DataFrame({'predict':pred_oob,'in_situ':LE,})
post_gs=pd.DataFrame({'predict':rf.predict(x_test),'in_situ':y_test,})
post_gs['index']=[i for i in range(post_gs.shape[0])]
post_gs=post_gs.set_index('index')
x=post_gs['in_situ']
y=post_gs['predict']
xy = np.vstack([x,y])  # point coordinates for the density estimate
z = gaussian_kde(xy)(xy)  # Gaussian kernel density evaluated at each point
idx = z.argsort()  # sort by density so the densest points are drawn last
x, y, z = x[idx], y[idx], z[idx]
fw = 800
ax = fig.add_subplot(len(gap_drople.columns)-1,1,u-2)
scatter = ax.scatter(x,y,marker='o',c=z,s=15,label='LST',cmap='jet')  # 'o' = filled circles; c sets point colour, cmap the colormap ('Spectral_r' is 'Spectral' reversed)
divider = make_axes_locatable(ax)  # reserves space for an attached colourbar axis
# plt.scatter(x, y, c=z, s=7, cmap='jet')
# plt.axis([0, fw, 0, fw])  # axis range
# plt.title( file.split('_',6)[1], family = 'Times New Roman',size=21)
# plt.text( 10, 700,len(feature), family = 'Times New Roman',size=21)
# plt.text(10, 700, 'Driver numbers = %s' % len(feature), family = 'Times New Roman',size=21)
# plt.text(10, 600, 'Size = %.f' % len(y), family = 'Times New Roman',size=18)  # text position depends on the x/y range
# plt.text(10, 650, 'RMSE = %.3f W/m²' % rmse, family = 'Times New Roman',size=18)
# plt.text(10, 700, 'R² = %.3f' % r2, family = 'Times New Roman',size=18)
# plt.text(10, 750, 'BIAS = %.3f W/m²' % bias, family = 'Times New Roman',size=18)
ax.set_xlabel('Station LE (W/m²)',family = 'Times New Roman',size=19)
ax.set_ylabel('Estimated LE (W/m²)',family = 'Times New Roman',size=19)
ax.plot([0,fw], [0,fw], 'gray', lw=2)  # 1:1 line
ax.set_xlim(0,fw)
ax.set_ylim(0,fw)
# ax.xaxis.set_tick_params(labelsize=19)
# ax.xaxis.set_tick_params(labelsize=19)
# plt.xticks(fontproperties='Times New Roman',size=19)
# plt.yticks(fontproperties='Times New Roman',size=19)
fig.set_tight_layout(True)
#================================================================MDS
s_ori = pd.read_csv(os.path.join(path1,file))
print('NaNs for MDS:\n',s_ori.drop(['LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']
,axis=1).isna().sum().sort_values(ascending=False))
s_ori.loc[s_ori['LE_F_MDS_QC'] !=0 , ['LE_F_MDS']] = np.nan
print('=====================================================')
print('NaNs for MDS:\n',s_ori.drop(['LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']
,axis=1).isna().sum().sort_values(ascending=False))
MDS_GAP=s_ori
if 'SW_IN' in MDS_GAP.columns.to_list() and 'TA' in MDS_GAP.columns.to_list() and 'VPD' in MDS_GAP.columns.to_list():
# Build REddyProc-style Year/DoY/Hour from TIMESTAMP_END; the repeated
# astype(str)/to_datetime round-trips of the original were no-ops and are
# collapsed here. (Used to error: "Time stamp is not equidistant (half-)hours
# in rows: 35040, 35088, 52560, 52608, 70080, 70128, 87600, 87648")
MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'].astype('str'))
MDS_GAP['Year'] = MDS_GAP['TIMESTAMP_END'].dt.year
MDS_GAP['DoY'] = MDS_GAP['TIMESTAMP_END'].dt.strftime('%j')
MDS_GAP['Hour'] = MDS_GAP['TIMESTAMP_END'].dt.hour + MDS_GAP['TIMESTAMP_END'].dt.minute/60
MDS_GAP.loc[:,'LE'] = y_test
print(MDS_GAP['LE'].dropna().sum())
MDS_GAP['LE'].to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) + str(u)+ '.txt'),sep=' ',index = False)
MDS_GAP['LE_F_MDS']=s_ori['LE_F_MDS']
MDS_GAP.loc[MDS_GAP['LE']>=-9999,['LE']] = -9999  # mark every test point as a gap (-9999) for MDS to fill
MDS_GAP['LE'].fillna(MDS_GAP['LE_F_MDS'],inplace=True)
MDS_GAP['Rg']=MDS_GAP['SW_IN']
MDS_GAP['Tair']=MDS_GAP['TA']
MDS_GAP['VPD']=MDS_GAP['VPD']
# MDS_GAP['NEE']=MDS_GAP['NEE_VUT_REF']
MDS_GAP=MDS_GAP[['Year','DoY','Hour','LE','Rg','Tair','VPD']]  # optionally 'Tsoil','rH'
# MDS_GAP.loc[MDS_GAP['Rg'] > 1200 , ['Rg']] = -9999  # driver limits: Rg <= 1200 W/m², VPD <= 50 hPa
# MDS_GAP.loc[MDS_GAP['Tair'] > 2.5 , ['Tair']] = -9999
# MDS_GAP.loc[MDS_GAP['VPD'] > 50 , ['VPD']] = -9999
# insert a units row at row 0 (REddyProc expects a units line under the header)
row = 0  # insertion position
value = pd.DataFrame([['-', '-', '-','Wm-2', 'Wm-2', 'degC','hPa']],columns=MDS_GAP.columns)  # units row
df_tmp1 = MDS_GAP[:row]
df_tmp2 = MDS_GAP[row:]
# splice the units row in (pd.concat: DataFrame.append was removed in pandas 2.0)
MDS_GAP = pd.concat([df_tmp1, value, df_tmp2])
MDS_GAP = MDS_GAP.fillna(-9999)
MDS_GAP.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\MDS_TRY666', str(file.split('_',6)[1]) + str(u) + '.txt'),sep=' ',index = False)#+ str(gaplong)
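# The exported .txt (Year DoY Hour LE Rg Tair VPD, a units row, -9999 for NA)
# follows the plain-text input layout of REddyProc's MDS gap-filler, which is
# presumably what consumes these files downstream.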
#============================== earlier variant: predict the real gaps, save, and compare
# gap_dropna_copy=gap_dropna.copy()
# gap_dropna_copy=gap_dropna_copy[feature]
# gap_dropna_copy=gap_dropna_copy.dropna()
# gap_dropna_copy.loc[:, 'LE_gap_filled'] = rf.predict(gap_dropna_copy)
# le=sole.copy()
# le['LE_F_MDS_QC'].replace([1,2,3], np.nan, inplace=True)
# le['LE_F_MDS_QC'].replace(0, -9999, inplace=True)
# le['LE_F_MDS_QC'].fillna(gap_dropna_copy['LE_gap_filled'], inplace=True)
# le['RMSE']=[rmse]*sole.shape[0]
# dic0={'TIMESTAMP_START':le['TIMESTAMP_START'].tolist()
# ,'TIMESTAMP_END':le['TIMESTAMP_END'].tolist()
# ,'LE_Gap_filled'+ str(u): le['LE_F_MDS_QC'].tolist()
# ,'RMSE'+ str(u): le['RMSE']
# ,'Drivers'+ str(u): [' '.join(Drivers.columns.tolist())]*sole.shape[0]
# }
# df0 = pd.DataFrame(dic0)
# dic={'list_name':df0, 'rmse':df0['RMSE'+ str(u)][df0.index[0]]}
# dic_list += [dic]
# sorted_dic=sorted(dic_list, key=lambda dic_list: dic_list['rmse'], reverse=False)
# list_name=[a['list_name'] for a in sorted_dic]  # each entry is a full DataFrame
# df = pd.concat(list_name,axis=1)
# df = df.loc[:,~df.columns.duplicated()]
# shunxu = [''.join(list(filter(str.isdigit,x))) for x in df.columns]
# shunxu0 = list(filter(None,shunxu))
# shunxu = list(set(shunxu0))  # set() loses order, so re-sort by original position
# shunxu.sort(key = shunxu0.index)
print(shunxu)
print(df)
for latter in shunxu:
rmse_df['median']=shunxu
rmse_df = rmse_df.set_index('median')
print(latter)
rmse_df.sort_values(by="RF_RMSE" , inplace=True, ascending=True)
print(rmse_df['插补率'].max())  # '插补率' = filling rate
print(rmse_df['插补率'][latter])
if rmse_df['插补率'][latter] < rmse_df['插补率'].max():
if countt > 5:
break
print(rmse_df['插补率'])
a = df
y_test6 = a['dynamic'+ str(latter)]
x_test6 = a['dynamicc'+ str(latter)]
# a['dynamic'+str(shunxu[0])].fillna(a['dynamic'+ str(latter)], inplace=True)  # LE update
# df2=pd.DataFrame(a.isna().sum().sort_values(ascending=False))  # tally the NaNs
# print(rmse_list[qian] , rmse_list[hou])
# print(tianchongliang[qian] , tianchongliang[hou])
# if qian==0 or rmse_list[qian] < rmse_list[hou] or tianchongliang[qian] > tianchongliang[hou] :
y_test6 = y_test6[~y_test6.index.isin(ALL_y_test.index)]  # keep only test points not yet in the pooled set
x_test6 = x_test6[~x_test6.index.isin(ALL_x_test.index)]
ALL_y_test = pd.concat([ALL_y_test, y_test6], axis=0, ignore_index=False)
ALL_x_test = pd.concat([ALL_x_test, x_test6], axis=0,ignore_index=False)
print('After concatenation\n',ALL_x_test)
print('After concatenation\n',ALL_y_test)
ALL_x_test666 = ALL_x_test
ALL_y_test = ALL_y_test.dropna()
ALL_x_test = ALL_x_test.dropna()
ALL_rmse=np.sqrt(mean_squared_error(ALL_y_test,ALL_x_test))
ALL_rmse_list.append(ALL_rmse)
r2=r2_score(ALL_y_test,ALL_x_test)
ALL_r2_list.append(r2)
bias=(ALL_x_test-ALL_y_test).mean() # bias=(pred_oob-LE).mean()
ALL_bias_list.append(bias)
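# ALL_x_test/ALL_y_test pool test points across driver combinations, so the
# metrics above are the "dynamic" (mixed-model) RMSE/R2/bias over the union
# of test indices rather than a single combination's hold-out.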
# pooled RMSE of the linear-interpolation baseline
s_orii = pd.read_csv(os.path.join(path1,file))
s_oriii = pd.read_csv(os.path.join(path1,file))
s_orii.loc[:,'LE'] = ALL_y_test
s_orii.loc[ALL_y_test.index,'LE_F_MDS'] = np.nan
s_orii['LE_F_MDS']= s_orii['LE_F_MDS'].interpolate()
rmse1=np.sqrt(mean_squared_error(ALL_y_test,s_orii.loc[ALL_y_test.index,'LE_F_MDS'] ))
all_rmse1_list.append(rmse1)
r21=r2_score(ALL_y_test,s_orii.loc[ALL_y_test.index,'LE_F_MDS'])
all_r21_list.append(r21)
bias1=(s_orii.loc[ALL_y_test.index,'LE_F_MDS'] - ALL_y_test).mean()
all_bias1_list.append(bias1)
pinjie_number.append(len(y_test6))
rmse_number.append(len(ALL_y_test))
train_number.append(int(trainset.shape[0]*0.2))
ALL_rmse_df=pd.DataFrame({'RF_RMSE':ALL_rmse_list,'IP_RMSE':all_rmse1_list
,'rmse_number':rmse_number
,'pinjie_number':pinjie_number
,'train_number':train_number
,'RF_R2':ALL_r2_list,'IP_R2':all_r21_list
,'RF_BIAS':ALL_bias_list,'IP_BIAS':all_bias1_list
})
print(ALL_rmse_df)
ALL_rmse_df.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE_ALL',str(file.split('_',6)[1]) +'.csv'),index = False)
if countt==0 and rmse_df['插补率'][latter] >= rmse_df['插补率'].max() :
print('again')
print(countt)
countt+=100
a = df
y_test6 = a['dynamic'+ str(latter)]
x_test6 = a['dynamicc'+ str(latter)]
# a['dynamic'+str(shunxu[0])].fillna(a['dynamic'+ str(latter)], inplace=True)  # LE update
# df2=pd.DataFrame(a.isna().sum().sort_values(ascending=False))  # tally the NaNs
# print(rmse_list[qian] , rmse_list[hou])
# print(tianchongliang[qian] , tianchongliang[hou])
# if qian==0 or rmse_list[qian] < rmse_list[hou] or tianchongliang[qian] > tianchongliang[hou] :
y_test6 = y_test6[~y_test6.index.isin(ALL_y_test.index)]  # keep only test points not yet in the pooled set
x_test6 = x_test6[~x_test6.index.isin(ALL_x_test.index)]
ALL_y_test = pd.concat([ALL_y_test, y_test6], axis=0, ignore_index=False)
ALL_x_test = pd.concat([ALL_x_test, x_test6], axis=0,ignore_index=False)
print('After concatenation\n',ALL_x_test)
print('After concatenation\n',ALL_y_test)
ALL_x_test666 = ALL_x_test
ALL_y_test = ALL_y_test.dropna()
ALL_x_test = ALL_x_test.dropna()
ALL_rmse=np.sqrt(mean_squared_error(ALL_y_test,ALL_x_test))
ALL_rmse_list.append(ALL_rmse)
r2=r2_score(ALL_y_test,ALL_x_test)
ALL_r2_list.append(r2)
bias=(ALL_x_test-ALL_y_test).mean() # bias=(pred_oob-LE).mean()
ALL_bias_list.append(bias)
# pooled RMSE of the linear-interpolation baseline
s_orii = pd.read_csv(os.path.join(path1,file))
s_oriii = pd.read_csv(os.path.join(path1,file))
s_orii.loc[:,'LE'] = ALL_y_test
s_orii.loc[ALL_y_test.index,'LE_F_MDS'] = np.nan
s_orii['LE_F_MDS']= s_orii['LE_F_MDS'].interpolate()
rmse1=np.sqrt(mean_squared_error(ALL_y_test,s_orii.loc[ALL_y_test.index,'LE_F_MDS'] ))
all_rmse1_list.append(rmse1)
r21=r2_score(ALL_y_test,s_orii.loc[ALL_y_test.index,'LE_F_MDS'])
all_r21_list.append(r21)
bias1=(s_orii.loc[ALL_y_test.index,'LE_F_MDS'] - ALL_y_test).mean()
all_bias1_list.append(bias1)
pinjie_number.append(len(y_test6))
rmse_number.append(len(ALL_y_test))
train_number.append(int(trainset.shape[0]*0.2))
ALL_rmse_df=pd.DataFrame({'RF_RMSE':ALL_rmse_list,'IP_RMSE':all_rmse1_list
,'rmse_number':rmse_number
,'pinjie_number':pinjie_number
,'train_number':train_number
,'RF_R2':ALL_r2_list,'IP_R2':all_r21_list
,'RF_BIAS':ALL_bias_list,'IP_BIAS':all_bias1_list
})
print(ALL_rmse_df)
ALL_rmse_df.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE_ALL',str(file.split('_',6)[1]) +'.csv'),index = False)
# =============================================================================
# Export the gap-filled series
# =============================================================================
s_orii['Interpolate'] = s_orii.loc[ALL_y_test.index,'LE_F_MDS']
s_orii['RF']= ALL_x_test666
s_orii['LE_F_MDS'] = s_oriii['LE_F_MDS']
# s_orii = s_orii[['TIMESTAMP_START','LE_F_MDS_QC','LE','RF','Interpolate',]]
s_orii.to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\RF插补值', str(file.split('_',6)[1]) + '.csv'),index = False)
#==============================================================MDS_ALL
s_ori = pd.read_csv(os.path.join(path1,file))
print('NaNs for MDS:\n',s_ori.drop(['LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']
,axis=1).isna().sum().sort_values(ascending=False))
s_ori.loc[s_ori['LE_F_MDS_QC'] !=0 , ['LE_F_MDS']] = np.nan
print('=====================================================')
print('NaNs for MDS:\n',s_ori.drop(['LE_F_MDS_QC'
,'TIMESTAMP_START','TIMESTAMP_END']
,axis=1).isna().sum().sort_values(ascending=False))
MDS_GAP=s_ori
if 'SW_IN' in MDS_GAP.columns.to_list() and 'TA' in MDS_GAP.columns.to_list() and 'VPD' in MDS_GAP.columns.to_list():
# Same Year/DoY/Hour construction as above, with the redundant
# astype(str)/to_datetime round-trips collapsed. (Used to error: "Time stamp
# is not equidistant (half-)hours in rows: 35040, 35088, 52560, 52608, 70080,
# 70128, 87600, 87648")
MDS_GAP['TIMESTAMP_END']=pd.to_datetime(MDS_GAP['TIMESTAMP_END'].astype('str'))
MDS_GAP['Year'] = MDS_GAP['TIMESTAMP_END'].dt.year
MDS_GAP['DoY'] = MDS_GAP['TIMESTAMP_END'].dt.strftime('%j')
MDS_GAP['Hour'] = MDS_GAP['TIMESTAMP_END'].dt.hour + MDS_GAP['TIMESTAMP_END'].dt.minute/60
MDS_GAP.loc[:,'LE'] = ALL_y_test
print(MDS_GAP['LE'].dropna().sum())
MDS_GAP['LE'].to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来ALL_rmse的原始值666', str(file.split('_',6)[1]) + '.txt'),sep=' ',index = False)
MDS_GAP['LE_F_MDS']=s_ori['LE_F_MDS']
MDS_GAP.loc[MDS_GAP['LE']>=-9999,['LE']] = -9999  # mark every pooled test point as a gap (-9999) for MDS to fill
MDS_GAP['LE'].fillna(MDS_GAP['LE_F_MDS'],inplace=True)
MDS_GAP['Rg']=MDS_GAP['SW_IN']
MDS_GAP['Tair']=MDS_GAP['TA']
MDS_GAP['VPD']=MDS_GAP['VPD']
# MDS_GAP['NEE']=MDS_GAP['NEE_VUT_REF']
MDS_GAP=MDS_GAP[['Year','DoY','Hour','LE','Rg','Tair','VPD']]  # optionally 'Tsoil','rH'
MDS_GAP.loc[MDS_GAP['Rg'] > 1200 , ['Rg']] = -9999  # driver limits: Rg <= 1200 W/m², VPD <= 50 hPa
# MDS_GAP.loc[MDS_GAP['Tair'] > 2.5 , ['Tair']] = -9999
MDS_GAP.loc[MDS_GAP['VPD'] > 50 , ['VPD']] = -9999
# insert a units row at row 0 (REddyProc expects a units line under the header)
row = 0  # insertion position
value = pd.DataFrame([['-', '-', '-','Wm-2', 'Wm-2', 'degC','hPa']],columns=MDS_GAP.columns)  # units row
df_tmp1 = MDS_GAP[:row]
df_tmp2 = MDS_GAP[row:]
# splice the units row in (pd.concat: DataFrame.append was removed in pandas 2.0)
MDS_GAP = pd.concat([df_tmp1, value, df_tmp2])
MDS_GAP = MDS_GAP.fillna(-9999)
MDS_GAP.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\ALL_MDS_TRY666', str(file.split('_',6)[1]) + '.txt'),sep=' ',index = False)#+ str(gaplong)
#=============================== number of drivers vs. filling rate
# fig = plt.subplot(8,5,36+dalei)
# plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\DoubleY',s.split('_',6)[1])
# , bbox_inches='tight', dpi=500)
# x = [x for x in reversed(range(3,len(gap_drople.columns)+1))] #reversed(range(len(df.index)+1),3)matplotlib does not support generators as input
# y1 = rmse_list
# y2 = filling_rate_list
# ax = fig.add_subplot(len(gap_drople.columns)-1,1,len(gap_drople.columns)-1)
# fig = plt.figure(figsize=(12,8),dpi=400)
# ax = fig.add_subplot(1,1,1)
# line1=ax.plot(x, y1,color='red',linestyle='--',marker='o',linewidth=2.5)
# ax.set_ylabel('RMSE of 25% testing set', {'family':'Times New Roman','weight':'normal','size':21},color='red')
# ax.set_xlabel('Number of drivers',{'family':'Times New Roman','weight':'normal','size':21})
# ax.tick_params(labelsize=19)
# # ax1.set_title("")
# ax2 = ax.twinx()  # second y-axis sharing the same x-axis
# #ax2.set_ylim([-0.05,1.05])  # y-axis range
# # ax2.set_yticks([0.0,0.3,0.5,0.7,0.9])  # tick positions
# # ax2.set_yticklabels([0.0,0.3,0.5,0.7,0.9])  # tick labels
# line2=ax2.plot(x, y2,color='blue',marker='o',linewidth=2.5)
# ax2.tick_params(labelsize=19)
# ax2.set_ylabel('Filling rate', {'family':'Times New Roman','weight':'normal','size':21},color='blue')
# # a2.invert_yaxis()  # invert_yaxis() flips the y-axis, invert_xaxis() the x-axis
# # plt.tick_params(labelsize=19)
# # plt.xticks(np.arange(5, 13, 1),fontproperties='Times New Roman',size=19)
# plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\1129',str(file.split('_',6)[1]) +'.png')
# , bbox_inches='tight', dpi=500)
# plt.show()
# =============================================================================
# Dynamic gap-filling
# =============================================================================
# for latter in shunxu[1:]:
# a = df
# b=a.loc[a['LE_Gap_filled'+ str(shunxu[0])] > -9999, ['LE_Gap_filled'+ str(shunxu[0]), 'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]  # only rows with LE values, used to fill the blanks above
# a['Drivers'+ str(shunxu[0])]=a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == np.nan, ['Drivers'+ str(shunxu[0])]]
# a['Drivers'+ str(shunxu[0])].fillna( b['Drivers'+ str(shunxu[0])] ,inplace = True )  # start fresh: Drivers of the first model
# a['RMSE' + str(shunxu[0])]=a.loc[a['LE_Gap_filled'+str(shunxu[0])] == np.nan, ['RMSE' + str(shunxu[0])]]
# a['RMSE' + str(shunxu[0])].fillna( b['RMSE'+ str(shunxu[0])] ,inplace = True )  # start fresh: RMSE of the first model
# b=a.loc[a['LE_Gap_filled'+ str(latter)] > -9999, ['LE_Gap_filled'+ str(latter), 'Drivers'+ str(latter), 'RMSE'+ str(latter)]]  # only rows with LE values, used to fill the blanks above
# a['Drivers'+ str(latter)]=a.loc[a['LE_Gap_filled'+ str(latter)] == np.nan, ['Drivers'+ str(latter)]]
# a['Drivers'+ str(latter)].fillna( b['Drivers'+ str(latter)] ,inplace = True )  # Drivers of this model
# a['RMSE' + str(latter)]=a.loc[a['LE_Gap_filled'+str(latter)] == np.nan, ['RMSE' + str(latter)]]
# a['RMSE' + str(latter)].fillna( b['RMSE'+ str(latter)] ,inplace = True )  # RMSE of this model
# a['LE_Gap_filled'+str(shunxu[0])].fillna(a['LE_Gap_filled'+ str(latter)], inplace=True)  # LE update
# df2=pd.DataFrame(a.isna().sum().sort_values(ascending=False))  # tally the NaNs
# print(a['LE_Gap_filled'+str(shunxu[0])])
# a['Drivers'+str(shunxu[0])].fillna(a['Drivers'+ str(latter)],inplace=True)  # Drivers update
# a['RMSE'+str(shunxu[0])].fillna(a['RMSE'+ str(latter)],inplace=True)  # RMSE update
# # attach timestamps to a
# so=pd.read_csv(os.path.join(path1,file))
# so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]
# print(a['TIMESTAMP_START'])
# a.to_csv(os.path.join(r'C:\Users\Lenovo\Desktop\R\用来rmse的原始值666', str(file.split('_',6)[1]) + '.csv'),index = False)
# # print(a)
# a['QC'] = np.nan
# a.loc[a['LE_Gap_filled'+ str(shunxu[0])] != -9999, 'QC'] = 1
# a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -9999 , 'QC'] = 0
# a['LE_Gap_filled'+ str(shunxu[0])].replace(np.nan,-8888,inplace=True)  # originally-NaN rows that could never be filled (too many missing drivers); QC 3 in the source marks ERA data
# a['LE_Gap_filled'+ str(shunxu[0])].replace(-9999,np.nan,inplace=True)  # a NaN can also mean no driver combination covered that stretch, so it stays empty
# a['LE_Gap_filled'+ str(shunxu[0])].fillna(so['LE_F_MDS'],inplace=True)  # whatever is still NaN at the end
# a.loc[a['LE_Gap_filled'+ str(shunxu[0])] == -8888 , 'QC'] = -9999
# a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC', 'Drivers'+ str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
# a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')
# a['LE_Gap_filled'+ str(shunxu[0])].fillna(a['LE_F_MDS'],inplace=True)
# a['LE_Gap_filled'+ str(shunxu[0])].replace(-8888,np.nan,inplace=True)
# a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled'+ str(shunxu[0]), 'QC', 'Drivers' + str(shunxu[0]), 'RMSE'+ str(shunxu[0])]]
# bianliangmen = pd.read_csv(os.path.join(path1,file))
# bianliangmen = bianliangmen.drop(['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS'],axis=1).columns
# for i in bianliangmen:
# a[str(i)]=np.nan
# # print(a.columns)
# a['Drivers' + str(shunxu[0])].replace(np.nan,-9999,inplace=True)
# b=a.loc[a['Drivers' + str(shunxu[0])]!=-9999]
# for i in b.columns[6:]:
# c=b[b['Drivers' + str(shunxu[0])].str.contains(i)]
# c[i].replace(np.nan,'+',inplace=True)
# a[i]=c[i]
# b=a.count(axis=1)-6
# b=pd.DataFrame(b)
# a['n_drivers']=b
# a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
# a['Drivers' + str(shunxu[0])].replace(-9999,np.nan,inplace=True)
# # print(a)
# a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(file.split('_',6)[1]) +'.csv'),index = False)
# #
# # total_number.append(int(sole.shape[0]))
# # post_dropna_number.append(int(train_option_dropna.shape[0]))
# # post_drop_le_abnormal_number.append(int(c.shape[0]))
# # test_number.append(int(c.shape[0]*0.25))
# # train_number.append(int(c.shape[0]*0.75))
# # # N_estimators.append(n_estimators)
# # # Max_depth.append(max_depth)
# # =========================================================== plot the scatter series for this file
# s_ori = pd.read_csv(os.path.join(path1,file))
# ori = s_ori.loc[s_ori['LE_F_MDS_QC']==0,['TIMESTAMP_START','LE_F_MDS']]
# filled = s_ori.loc[s_ori['LE_F_MDS_QC']!=0,['TIMESTAMP_START','LE_F_MDS']]
# s_ori['TIMESTAMP_START'] = pd.to_datetime(s_ori['TIMESTAMP_START'])
# s_ori['year'] = s_ori['TIMESTAMP_START'].dt.year
# gap_filled = a.loc[a['QC'] == 1,['TIMESTAMP_START','LE_Gap_filled'+ str(shunxu[0])]]
# fig1 ,ax = plt.subplots(5,1,sharex='col',figsize=(25,9),dpi=300)
# ax0 = ax[0]
# ax0.plot( 'LE_F_MDS', data=ori, linestyle='none',marker='o')
# ax1 = ax[1]
# ax1.plot( 'LE_F_MDS', data=filled, color='#ff7f0e',linestyle='none', marker='o')
# ax2 = ax[2]
# ax2.plot( 'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
# ax2.plot( 'LE_F_MDS', data=filled, alpha=0.6, linestyle='none', marker='o')
# ax3 = ax[3]
# # ax2.plot( 'LE_F_MDS', data=s_ori, alpha=0.6, linestyle='none', marker='o')
# ax3.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled, color='#FAA460', linestyle='none', marker='o' )
# ax4 = ax[4]
# ax4.plot( 'LE_F_MDS', data=ori, alpha=0.6, linestyle='none', marker='o')
# ax4.plot('LE_Gap_filled'+ str(shunxu[0]),data=gap_filled,color='#FAA460', alpha=0.6, linestyle='none', marker='o' )
# ax0.set_ylabel('in-situ', fontsize=19)
# ax1.set_ylabel('MDS', fontsize=19)
# ax2.set_ylabel('FLUXNET2015', fontsize=19)
# ax3.set_ylabel('RF', fontsize=19)
# ax4.set_ylabel('Dynamic', fontsize=19)
# nianfen = int(file.split('_',6)[5].split('-',2)[0])
# nianfen1 = int(file.split('_',6)[5].split('-',2)[1])
# ax2.set_xticks([365*48*x for x in range(nianfen1+2-nianfen)])
# ax2.set_xticklabels([x for x in range(nianfen,nianfen1+2)],fontproperties='Times New Roman',size=19)
# ax4.set_xlabel('Year', fontsize=19)
# plt.savefig(os.path.join(r'D:\Fluxnet\PIC666\1128',str(file.split('_',6)[1]) +'.png')
# , bbox_inches='tight', dpi=500)
# plt.show()
#=============================================== export
# dic={'SITES':site_list,'YEAR':year_list,'原始数目':total_number  # '原始数目' = raw count
# ,'去掉空值后':post_dropna_number                                  # after dropping NaNs
# ,'去掉LE异常值后':post_drop_le_abnormal_number                     # after dropping LE outliers
# ,'TRAIN_NUMBER':train_number
# ,'TEST_NUMBER':test_number
# # ,'n_estimators':N_estimators,'max_depth':Max_depth
# ,'RMSE':Rmse_list,'R2':R2_list,'BIAS':Bias_list
# ,'Drivers_column':Drivers_column
# ,'Filling_rate' : Filling_rate_list
# }
# dic=pd.DataFrame(dic)
# # print(dic)
# dic.to_csv(r'D:\Fluxnet\OUTCOME\RMSE_ALL\RMSE_All_Day.csv')
# dic_sole={
# 'RMSE':rmse_list,'R2':r2_list,'BIAS':bias_list
# }
# dic_sole=pd.DataFrame(dic_sole)
# dic_sole.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\RMSE', str(file.split('_',6)[1]) +'.csv'),index = False)
#===============================================Various length of gap
# for j,k in zip([0.05,0.075,0.125],[6,24,48]):  # one day / one week / one month; together 0.25 of all data
# #48,336,720
# df0=sole.copy()
# print(len(df0))
# df=df0[df0['LE_F_MDS_QC']==0]
# print(df['LE_F_MDS_QC'])
# print(len(df))
# print('!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
# # candidate start positions for the artificial gaps
# start_point=np.arange(df['LE_F_MDS_QC'].index[0],df['LE_F_MDS_QC'].index[-1]-k+1)  # k is the gap length
# # number of gaps
# gap_number=int(len(df)*j/k)
# print(gap_number)
# # randomly choose start positions
# # np.random.seed(1)  # makes the random draw reproducible
# gap_posi=np.random.choice(start_point,gap_number*3)  # extra candidates to choose from
# posi=sorted(gap_posi)  # sort them
# print(posi)
# count=0
# gap_qujian=[]
# # not every random start is usable: a gap must not overlap an earlier one, and it needs enough data
# for m,n in enumerate(posi):  # m is the index, n the start position (itself an index)
# # the span of a single gap,
# # i.e. which positions the gap covers
# gap_danqujian_list =[h for h in np.arange(n,n+k)]
# print(gap_danqujian_list)
# print('==')
# # the gap inside the full DataFrame
# gap_df = df0.iloc[gap_danqujian_list]
# # print(gap_df)
# # the gap span must stay inside the allowed range
# if np.isin(gap_danqujian_list,start_point).all():
# # gaps of different lengths must not overlap
# if m>0 and n-posi[m-1] <= k:
# continue
# # the gap span needs enough raw data
# if len(gap_df.dropna()) / len(gap_df) < 0.5:
# continue
# gap_qujian.extend(gap_danqujian_list)
# print(gap_qujian)
# count += 1
# if count == gap_number:  # stop only once each gap type reaches gap_number, i.e. the target share
# print('@@@@@@@@@@@@@@@@@@@@@')
# print(count)
# break
# # drop the indices whose LE is NaN
# test_df=df0.iloc[gap_qujian]  # .iloc[[...]] selects rows by position
# print(test_df)
# print(len(test_df))
# test=test_df.loc[test_df['LE_F_MDS_QC']==0,].dropna(axis=0)
# print(test)
# print(len(test))
# train_index=np.setdiff1d(df0.index,test_df.index)  # setdiff1d: values in the first array but not the second
# print(train_index)
# train_df=df0.iloc[train_index]
# train=train_df.loc[train_df['LE_F_MDS_QC']==0,].dropna(axis=0)
# print(train)
# print(len(train))
# pd.set_option('display.max_columns', None)
# # print(test.head(5))
# print(train.shape)
# print(test.shape)
# a=pd.DataFrame(test.isna().sum().sort_values(ascending=False))
# # des=test.describe()
# # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
# # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
# # test=test[(test['LE_F_MDS'] <=shangxu[3])
# # &(test['LE_F_MDS'] >=xiaxu[3])]
# # print(des)
# # des=train.describe()
# # shangxu=des.loc['75%']+1.5*(des.loc['75%']-des.loc['25%'])
# # xiaxu=des.loc['25%']-1.5*(des.loc['75%']-des.loc['25%'])
# # train=train[(train['LE_F_MDS'] <=shangxu[3])
# # &(train['LE_F_MDS'] >=xiaxu[3])]
# # print(xiaxu)
# train=train.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
# test=test.drop(['TIMESTAMP_START','TIMESTAMP_END','LE_F_MDS_QC'],axis=1)
# # train_Drivers=train.drop(['LE_F_MDS'],axis=1)
# train_Drivers=train[feature]
# print(train_Drivers.index)
# # test_Drivers=test.drop(['LE_F_MDS'],axis=1)
# test_Drivers=test[feature]
# print(test_Drivers.index)
# train_LE=train['LE_F_MDS']
# print(train_LE.index)
# test_LE=test['LE_F_MDS']
# print(test_LE.index)
# # x_train,x_test,y_train,y_test=train_test_split(Drivers,LE
# # ,test_size=0.25
# # ,random_state=(0))
# print(train_Drivers.shape)
# print(test_Drivers.shape)
# print(train_LE.shape)
# print(test_LE.shape)
# # ============================== modeling ====================================
# rf1=RandomForestRegressor(n_estimators=1100
# ,max_depth=80
# ,random_state=(0))
# rf1.fit(train_Drivers,train_LE)
# rmse1=np.sqrt(mean_squared_error(test_LE,rf1.predict(test_Drivers)))
# rmse_list1.append(rmse1)
# rmse_df=pd.DataFrame({'rmse':rmse_list1})
# print(rmse_df)
# r2=r2_score(test_LE,rf1.predict(test_Drivers))
# r2_list1.append(r2)
# r2_df=pd.DataFrame({'r2':r2_list1})
# bias=(rf1.predict(test_Drivers)-test_LE).mean()
# bias_list1.append(bias)
# bias_df=pd.DataFrame({'bias':bias_list1})
# site_list1+=[s.split('_',6)[1]]
# year_list1+=[int(s.split('_',6)[5].split('-',1)[1])
# -int(s.split('_',6)[5].split('-',1)[0])+1]
# # total_number.append(int(b.shape[0]))
# # post_dropna_number.append(int(a.shape[0]))
# # post_drop_le_abnormal_number.append(int(c.shape[0]))
# test_number1.append(int(test.shape[0]))
# train_number1.append(int(train.shape[0]))
# dic2={'SITES':site_list1,'YEAR':year_list1
# # ,'原始数目':total_number                       # raw count
# # ,'去掉空值后':post_dropna_number                # after dropping NaNs
# # ,'去掉LE异常值后':post_drop_le_abnormal_number   # after dropping LE outliers
# ,'TRAIN_NUMBER':train_number1
# ,'TEST_NUMBER':test_number1
# # ,'n_estimators':N_estimators,'max_depth':Max_depth
# ,'RMSE':rmse_list1,'R2':r2_list1,'BIAS':bias_list1
# }
# dic2=pd.DataFrame(dic2)
# print(dic2)
# dic2.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\GAP_diff', str(s.split('_',6)[1]) + '.csv'),index = False)
#======================================== read the eight CSVs back in
# dic_list=[]
# for i in range(3,5):
# df=pd.read_csv(os.path.join(outpath,str(s.split('_',6)[1]) + str(i) + '.csv'))
# dic={'list_name':df, 'rmse':df['RMSE'][0]}
# dic_list+=[dic]
# print(dic_list)
# print('=============================================')
# # df3=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '3' +'.csv'))
# # df4=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '4' +'.csv'))
# # df5=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '5' +'.csv'))
# # df6=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '6' +'.csv'))
# # df7=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '7' +'.csv'))
# # df8=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '8' +'.csv'))
# # df9=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '9' +'.csv'))
# # df10=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '10' +'.csv'))
# # df11=pd.read_csv(os.path.join(r'D:\Fluxnet\OUTCOME',str(s.split('_',6)[1]) + '11' +'.csv'))
# # dic=[{'list_name':df3, 'rmse':df3['RMSE'][0]}
# # ,{'list_name':df4, 'rmse':df4['RMSE'][0]}
# # ,{'list_name':df5, 'rmse':df5['RMSE'][0]}
# # ,{'list_name':df6, 'rmse':df6['RMSE'][0]}
# # ,{'list_name':df7, 'rmse':df7['RMSE'][0]}
# # ,{'list_name':df8, 'rmse':df8['RMSE'][0]}
# # ,{'list_name':df9, 'rmse':df9['RMSE'][0]}
# # ,{'list_name':df10, 'rmse':df10['RMSE'][0]}
# # ,{'list_name':df11, 'rmse':df11['RMSE'][0]}
# # ]
# sorted_dic=sorted(dic_list, key=lambda dic_list: dic_list['rmse'], reverse=False)
# print(sorted_dic)
# list_name=[a['list_name'] for a in sorted_dic]  # each entry is a full DataFrame
# print(list_name)
# df=pd.concat(list_name,axis=1)
# print(df)
# df.to_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))
# a=pd.read_csv(os.path.join(outpath, str(s.split('_',6)[1]) +'6666'+'.csv'))
# # pd.set_option('display.max_columns', None)
# df=pd.DataFrame(a.isna().sum().sort_values(ascending=False))
# print(a)
# # filling with fillna alone works, but Drivers still has to be filled!!!
# # start filling from the combination with the lowest RMSE
# print(df.columns)
# first suffix
# b=a.loc[a['LE_Gap_filled'] > -9999, ['LE_Gap_filled','Drivers','RMSE']]
# a['Drivers']=a.loc[a['LE_Gap_filled'] == np.nan, ['Drivers']]
# a['Drivers'].fillna( b['Drivers'] ,inplace = True )  # start fresh: Drivers of the first model
# print(a['Drivers'].describe())
# a['RMSE']=a.loc[a['LE_Gap_filled'] == np.nan, ['RMSE']]
# a['RMSE'].fillna( b['RMSE'] ,inplace = True )  # start fresh: RMSE of the first model
# print(a['RMSE'].describe())
# b=a.loc[a['LE_Gap_filled.1'] > -9999, ['LE_Gap_filled.1', 'Drivers.1', 'RMSE.1']]  # only rows with LE values, used to fill the blanks above
# a['Drivers.1']=a.loc[a['LE_Gap_filled.1'] == np.nan, ['Drivers.1']]
# a['Drivers.1'].fillna( b['Drivers.1'] ,inplace = True )  # Drivers of the second model
# print(a['Drivers.1'].describe())
# a['RMSE.1']=a.loc[a['LE_Gap_filled'] == np.nan, ['RMSE.1']]
# a['RMSE.1'].fillna( b['RMSE.1'] ,inplace = True )  # RMSE of the second model
# print(a['RMSE.1'].describe())
# a['LE_Gap_filled'].fillna(a['LE_Gap_filled.1'], inplace=True)  # LE update
# df1=pd.DataFrame(a.isna().sum().sort_values(ascending=False))  # tally the NaNs
# print(df1)
# a['Drivers'].fillna(a['Drivers.1'],inplace=True)  # Drivers update
# print(a['Drivers'].describe())
# a['RMSE'].fillna(a['RMSE.1'],inplace=True)  # RMSE update
# print(a['RMSE'].describe())
# # suffixes .2 through .9: the same three-step update (Drivers fill, RMSE
# # fill, LE fillna) repeated verbatim for columns '.2' through '.9'
# # attach timestamps to a
# so=pd.read_csv(os.path.join(path1,s))
# so=so[['TIMESTAMP_START' ,'TIMESTAMP_END','LE_F_MDS']]
# print(a['TIMESTAMP_START'])
# print(a.shape)
# a['QC'] = np.nan
# a.loc[a['LE_Gap_filled'] > -9999, 'QC'] = 1
# a.loc[a['LE_Gap_filled'] == -9999 , 'QC'] = 0
# a['LE_Gap_filled'].replace(np.nan,-8888,inplace=True)  # originally-NaN rows that could never be filled (too many missing drivers); QC 3 in the source marks ERA data
# a['LE_Gap_filled'].replace(-9999,np.nan,inplace=True)  # a NaN can also mean no driver combination covered that stretch, so it stays empty
# a['LE_Gap_filled'].fillna(sole['LE_F_MDS'],inplace=True)  # whatever is still NaN at the end
# a.loc[a['LE_Gap_filled'] == -8888 , 'QC'] = -9999
# print(a.dropna().shape[0]/a.shape[0])
# a=a[[ 'TIMESTAMP_END', 'LE_Gap_filled', 'QC', 'Drivers', 'RMSE']]
# a= pd.merge(so,a,how='outer',on='TIMESTAMP_END')
# a['LE_Gap_filled'].fillna(a['LE_F_MDS'],inplace=True)
# a['LE_Gap_filled'].replace(-8888,np.nan,inplace=True)
# a=a[['TIMESTAMP_START', 'TIMESTAMP_END', 'LE_Gap_filled', 'QC', 'Drivers', 'RMSE']]
# a['SW_IN_F_MDS']=np.nan
# a['NETRAD']=np.nan
# a['G_F_MDS']=np.nan
# a['TA_F_MDS']=np.nan
# a['RH']=np.nan
# a['WD']=np.nan
# a['WS']=np.nan
# a['PA_F']=np.nan
# a['VPD_F_MDS']=np.nan
# a['NDVI']=np.nan
# a['TS_F_MDS_1']=np.nan
# a['SWC_F_MDS_1']=np.nan
# a['TA_F_MDS']=np.nan
# a['Drivers'].replace(np.nan,-9999,inplace=True)
# b=a.loc[a['Drivers']!=-9999]
# # print(b)
# for i in b.columns[6:]:
# # print(i)
# c=b[b['Drivers'].str.contains(i)]
# c[i].replace(np.nan,'+',inplace=True)
# a[i]=c[i]
# b=a.count(axis=1)-6
# b=pd.DataFrame(b)
# a['n_drivers']=b
# a['n_drivers'].replace([-1,-2,-3],np.nan,inplace=True)
# a['Drivers'].replace(-9999,np.nan,inplace=True)
# # a.to_csv(os.path.join(path,sole+'.csv'),index = False)
# a.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME\FILLED',str(s.split('_',6)[1]) +'.csv'),index = False)
# # create empty columns
# # df["Empty_1"] = ""
# # df["Empty_2"] = np.nan
# # df['Empty_3'] = pd.Series()