Computing accuracy and AUC with logistic regression, a decision tree, and an SVM
- Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import label_binarize
- Read the data
# Read the dataset
data_all = pd.read_csv('/home/infisa/wjht/project/DataWhale/data_all.csv', encoding='gbk')
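A quick sanity check on the loaded frame is a useful habit; a minimal sketch, assuming the CSV loads into data_all as above and that 'status' is the binary target used later:
# Sketch: confirm the read worked and inspect the label distribution
print(data_all.shape)                      # rows and columns
print(data_all['status'].value_counts())   # count of each target class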
- Split the dataset
# Split into training and test sets
features = [x for x in data_all.columns if x not in ['status']]
X = data_all[features]
y = data_all['status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
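The split above does not preserve the class ratio between the two sets. A minimal sketch of a stratified alternative, assuming the same X and y; the *_s names are mine and are not used in the runs reported below:
# Sketch: stratified split so train and test keep the original class ratio (assumption: binary 'status' label)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.3, random_state=2018, stratify=y)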
- Build the models
lr = LogisticRegression()  # logistic regression model
lr.fit(X_train, y_train)
tr = DecisionTreeClassifier()  # decision tree model
tr.fit(X_train, y_train)
svm = SVC()  # SVM model (RBF kernel by default)
svm.fit(X_train, y_train)
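LogisticRegression and SVC are both sensitive to feature scale, so wrapping them with StandardScaler in a Pipeline is a common refinement. A minimal sketch, assuming all features in X are numeric; lr_scaled and svm_scaled are names I introduce here and are not the models scored below:
# Sketch: scale features before the scale-sensitive models
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

lr_scaled = make_pipeline(StandardScaler(), LogisticRegression())   # scaler + logistic regression
svm_scaled = make_pipeline(StandardScaler(), SVC())                 # scaler + SVM
lr_scaled.fit(X_train, y_train)
svm_scaled.fit(X_train, y_train)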
- Model scoring
# Model scores: score() returns mean accuracy on the test set
lr_score = lr.score(X_test, y_test)
print(lr_score)
'lr_score:0.7484232655921513'
tr_score = tr.score(X_test, y_test)
print(tr_score)
'tr_score:0.6797477224947442'
svm_score = svm.score(X_test, y_test)
print(svm_score)
'svm_score:0.7484232655921513'
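score() here is plain accuracy, which is easiest to read against the majority-class baseline. A minimal sketch of that comparison, assuming y_test from the split above; the baseline name is mine:
# Sketch: accuracy of always predicting the majority class, for comparison with the scores above
baseline = y_test.value_counts(normalize=True).max()   # share of the most frequent class in the test set
print(baseline)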
- Compute AUC
# Compute AUC values
y_test_hot = label_binarize(y_test, classes=(0, 1))  # binarize the test labels into an indicator matrix
lr_y_score = lr.decision_function(X_test)  # decision scores (log-odds) for each test sample
svm_y_score = svm.decision_function(X_test)  # decision scores (signed distance to the separating hyperplane)
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot.ravel(), lr_y_score.ravel())  # ROC curve; lr_thresholds are the decision thresholds
svm_fpr, svm_tpr, svm_thresholds = metrics.roc_curve(y_test_hot.ravel(), svm_y_score.ravel())  # ROC curve; svm_thresholds are the decision thresholds
lr_auc = metrics.auc(lr_fpr, lr_tpr)
print(lr_auc)
'lr_auc:0.5674626772245001'
svm_auc = metrics.auc(svm_fpr, svm_tpr)
print(svm_auc)
'svm_auc:0.5674626772245001'
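The decision tree has no decision_function, but its AUC can still be obtained from predicted probabilities. A minimal sketch, assuming the fitted tr from above and a binary 0/1 label; tr_y_score and tr_auc are names I introduce here:
# Sketch: AUC for the decision tree via the predicted probability of the positive class
tr_y_score = tr.predict_proba(X_test)[:, 1]          # probability of class 1 for each test sample
tr_auc = metrics.roc_auc_score(y_test, tr_y_score)   # AUC directly, without an explicit roc_curve step
print(tr_auc)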