导入所需包
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import datetime
import os

# Every data file below is opened by bare file name, so the script first
# switches into the experiment directory (machine-specific absolute path).
os.chdir("E:\\Academic\\Graduation thesis\\PSSP_experiment")

# Wall-clock reference used at the end of the script to report total run time.
start_time = datetime.datetime.now()
载入特征集
import pickle

# Load the feature matrix and the label vector from pickle files located in
# the current working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by a trusted source.
#
# Alternative feature files tried previously:
#   'CB513_AAC_pssm_X.pkl', 'CB513_AAC_pssm_head_tail_flag_X.pkl'
pkl_file_name = 'CB513_AAC_pssm_40_not_C_X.pkl'  # 66340 (presumably the sample count -- TODO confirm)
with open(pkl_file_name, "rb") as f:
    # Bound to upper-case ``X``: the split below reads ``X``, and the previous
    # lower-case ``x`` binding made that line fail with a NameError.
    X = pickle.load(f)

# Alternative label files tried previously:
#   'CB513_AAC_pssm_Y.pkl', 'CB513_AAC_pssm_Y_onehot.pkl'
pkl_file_name = 'CB513_AAC_pssm_not_C_Y.pkl'  # 66340 (presumably the sample count -- TODO confirm)
with open(pkl_file_name, "rb") as f:
    y = pickle.load(f)

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018)
#print('the proportition of label 1 in y_test: %.2f%%'%(len(y_test[y_test==1])/len(y_test)*100))

# Standardize the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split; re-fitting
# on the test split would scale the two splits with different statistics.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
构建模型进行网格搜索
由于对模型参数不熟练,只好边看文档边选
# Settings shared by every grid search below: 5-fold cross-validation,
# candidates ranked by ROC AUC.
_search_kwargs = {'cv': 5, 'scoring': 'roc_auc'}

# Logistic regression: search over solver and regularization strength C.
parameters_lr = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
    'C': [0.1, 1, 10],
}
lr_model = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=10000),
    parameters_lr,
    **_search_kwargs
)

# Support vector machine: search over kernel and C. probability=True is
# required because the training loop calls predict_proba on the fitted model.
parameters_svm = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
}
svm_model = GridSearchCV(
    SVC(class_weight='balanced', gamma='auto', probability=True),
    parameters_svm,
    **_search_kwargs
)

# Decision tree: search over split criterion and per-split feature subset.
parameters_dt = {
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None],
}
dt_model = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced'),
    parameters_dt,
    **_search_kwargs
)

# Ensemble models all share one grid: number of estimators 10, 20, ..., 90.
parameters_en = {'n_estimators': range(10, 100, 10)}
rf_model = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),
    parameters_en,
    **_search_kwargs
)
gbdt_model = GridSearchCV(GradientBoostingClassifier(), parameters_en, **_search_kwargs)
xgb_model = GridSearchCV(XGBClassifier(), parameters_en, **_search_kwargs)
#lgbm_model=GridSearchCV(LGBMClassifier(),parameters_en,cv=5,scoring='roc_auc')

# Display name -> grid-search object; the training loop iterates this mapping
# in insertion order.
models = {
    'LR': lr_model,
    'SVM': svm_model,
    'DT': dt_model,
    'RF': rf_model,
    'GBDT': gbdt_model,
    'XGBoost': xgb_model,
    # 'LGBM': lgbm_model,
}
定义评估模型函数
# Empty results table; the training loop appends one row per (model, dataset)
# pair, using ``row`` as the next free integer index.
df_result = pd.DataFrame(
    columns=[
        'model',
        'dataset',
        'accuracy',
        'precision',
        'recall',
        'f1_score',
        'auc',
    ]
)
row = 0
def evaluate(y_pre, y, y_proba):
    """Score one set of predictions against the true labels.

    Args:
        y_pre: Predicted labels.
        y: True labels.
        y_proba: 2-D score array; only column index 1 is used, as the score
            fed to the ROC computation (the caller passes the output of
            ``predict_proba``).

    Returns:
        A 7-tuple ``(accuracy, precision, recall, f1, fpr, tpr, auc)`` where
        ``fpr`` and ``tpr`` are the ROC curve coordinates and ``auc`` is the
        area under that curve.
    """
    # Label-based metrics, evaluated in the order they appear in the result.
    label_metrics = tuple(
        metric(y, y_pre)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    # ROC coordinates; the thresholds returned alongside them are not needed.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, y_proba[:, 1])
    area = auc(false_pos_rate, true_pos_rate)
    return (*label_metrics, false_pos_rate, true_pos_rate, area)
def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve on the current matplotlib figure.

    No new figure is created, so successive calls overlay their curves on the
    same axes. The dashed diagonal marks the chance-level reference line.

    Args:
        fpr: False positive rates (x coordinates of the curve).
        tpr: True positive rates (y coordinates of the curve).
        label: Legend entry for this curve; ``None`` leaves it unlabeled.
    """
    #plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    # Axis label spelling corrected (was 'True Poisitive Rate').
    plt.ylabel('True Positive Rate')
    plt.legend()
训练模型并做评估
# Fit every grid search, record train/test metrics in df_result, and overlay
# the test-set ROC curves on a single figure.
#plt.figure(figsize=(8,6))
for name, model in models.items():
    print(name, 'start training...')
    model.fit(X_train, y_train)
    print(model.best_params_)

    # Metrics on the held-out split.
    acc, p, r, f1, fpr_test, tpr_test, auc_test = evaluate(
        model.predict(X_test), y_test, model.predict_proba(X_test)
    )
    df_result.loc[row] = [name, 'test', acc, p, r, f1, auc_test]
    row += 1

    # Metrics on the training split, recorded for comparison with the test row.
    acc, p, r, f1, fpr_train, tpr_train, auc_train = evaluate(
        model.predict(X_train), y_train, model.predict_proba(X_train)
    )
    df_result.loc[row] = [name, 'train', acc, p, r, f1, auc_train]
    row += 1

    # Only the test curve is drawn; swap in the line below for the train curve.
    plot_roc_curve(fpr_test, tpr_test, label=name)
    #plot_roc_curve(fpr_train,tpr_train,label=name)

# Once every model has been processed: print the metric table, show the figure.
print(df_result)
plt.show()
# Report the total wall-clock run time of the script.
end_time = datetime.datetime.now()
# timedelta.seconds holds only the seconds component (0..86399) and drops
# whole days, so a run of 24 hours or longer would be under-reported.
# total_seconds() covers the full duration; it is truncated to an int so the
# printed format stays the same.
print(int((end_time - start_time).total_seconds()), 'seconds')
这篇博客介绍了如何导入必要的Python包,加载特征集,通过网格搜索构建和优化模型,并定义了评估模型的函数来训练和评估模型。
1630

被折叠的 条评论
为什么被折叠?



