Overview:
In this chapter, I evaluate the performance of the machine learning algorithms built in the previous chapter, where I already trained four models:
SVM-RBF, SVM-POLY, Bagging, and AdaBoost.
Step 1: Confusion Matrix
For each model I plot the confusion matrix twice: once with raw counts and once normalized (a minimal sketch of how such a matrix is computed follows at the end of this step).
SVM-RBF (without normalization and with normalization)


SVM-POLY (without normalization and with normalization)


Bagging (without normalization and with normalization)


AdaBoost (without normalization and with normalization)


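Since the screenshots are not reproduced here, below is a minimal sketch of how each confusion matrix can be computed with scikit-learn. The label and prediction arrays are hypothetical stand-ins for yTest and the model predictions from the script in Step 3.

from sklearn.metrics import confusion_matrix
import numpy as np

# hypothetical ground truth and predictions (1 = heart disease, 0 = no heart disease)
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])

# raw counts: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
print(cm)

# row-normalized version, corresponding to the "with normalization" plots
cm_normalized = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm_normalized)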
Step 2: Metrics
Based on the confusion matrices, I calculate the following metrics (a sketch of the corresponding scikit-learn calls is shown at the end of this step):
- F-measure (F1 score)
- accuracy
- recall (sensitivity, true positive rate)
- precision (positive predictive value)
- AUC (area under the ROC curve)
--> This is a screenshot of all the metrics for each algorithm.

--> Put them into Excel.

--> Rank them to get an intuitive understanding of how the models compare.

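As a reference for how these numbers are obtained, here is a minimal sketch of the scikit-learn metric calls, again on hypothetical labels, hard predictions, and positive-class probabilities rather than the actual model outputs.

from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score
import numpy as np

# hypothetical labels, hard predictions, and predicted probabilities for the positive class
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_pred = np.array([1, 0, 0, 1, 0, 1, 1, 0])
y_score = np.array([0.9, 0.2, 0.4, 0.8, 0.1, 0.6, 0.7, 0.3])

print("F1        :", f1_score(y_true, y_pred))
print("Accuracy  :", accuracy_score(y_true, y_pred))
print("Recall    :", recall_score(y_true, y_pred))      # sensitivity / true positive rate
print("Precision :", precision_score(y_true, y_pred))   # positive predictive value
print("AUC       :", roc_auc_score(y_true, y_score))    # area under the ROC curve, from scores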
Step 3: Plot ROC Curve

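Before the full script, a small sketch of what roc_curve actually returns: it sweeps the decision threshold over the predicted scores and reports one (false positive rate, true positive rate) pair per threshold. The arrays here are hypothetical.

from sklearn.metrics import roc_curve, roc_auc_score
import numpy as np

# hypothetical labels and predicted probabilities for the positive class
y_true = np.array([1, 0, 1, 1, 0, 0, 1, 0])
y_score = np.array([0.9, 0.2, 0.4, 0.8, 0.1, 0.6, 0.7, 0.3])

fpr, tpr, thresholds = roc_curve(y_true, y_score)
print(fpr)          # false positive rates, one per threshold
print(tpr)          # true positive rates, one per threshold
print(thresholds)   # the thresholds that produce each point
print("AUC:", roc_auc_score(y_true, y_score))

The complete script for all three steps is listed below; the confusion-matrix and metric blocks from Steps 1 and 2 are kept as comments, so only the ROC curves are produced when it runs.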
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import tree
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score, roc_auc_score, \
roc_curve
from plot_confusion_matrix import plot_confusion_matrix  # local helper that plots (optionally normalized) confusion matrices
## load data
trainSet = pd.read_csv("clevelandtrain.csv")
testSet = pd.read_csv("clevelandtest.csv")
xtrain = (trainSet.drop(["heartdisease::category|0|1"], axis=1)).iloc[:,:].values # (152, 13)
ytrain = trainSet["heartdisease::category|0|1"].iloc[:].values # (152,)
xtest = (testSet.drop(["heartdisease::category|0|1"], axis=1)).iloc[:,:].values # (145, 13)
ytest = testSet["heartdisease::category|0|1"].iloc[:].values # (145,)
## data preprocessing
# alternative: use the raw features without one-hot encoding
# xTrain = xtrain
# yTrain = ytrain
# xTest = xtest
# yTest = ytest
# one-hot-encoder: #9 (cp), #19 (restecg), #41 (slope), #51 (thal)
xtrain_pre = trainSet.drop(["cp", "restecg", "slope", "thal", "heartdisease::category|0|1"], axis=1).iloc[:,:].values # (152, 9)
xtrain_cp = trainSet["cp"].iloc[:].values
xtrain_restecg = trainSet["restecg"].iloc[:].values
xtrain_slope = trainSet["slope"].iloc[:].values
xtrain_thal = trainSet["thal"].iloc[:].values
ohe1 = OneHotEncoder(sparse = False,categories='auto',handle_unknown='ignore')
ohe2 = OneHotEncoder(sparse = False,categories='auto',handle_unknown='ignore')
ohe3 = OneHotEncoder(sparse = False,categories='auto',handle_unknown='ignore')
ohe4 = OneHotEncoder(sparse = False,categories='auto',handle_unknown='ignore')
xtrain_cp = ohe1.fit_transform(xtrain_cp.reshape(-1,1)) # (152, 4)
xtrain_restecg = ohe2.fit_transform(xtrain_restecg.reshape(-1,1)) # (152, 3)
xtrain_slope = ohe3.fit_transform(xtrain_slope.reshape(-1,1)) # (152, 3)
xtrain_thal = ohe4.fit_transform(xtrain_thal.reshape(-1,1)) # (152, 3)
xTrain = np.hstack((xtrain_pre, xtrain_cp, xtrain_restecg, xtrain_slope, xtrain_thal)) # (152, 22)
yTrain = ytrain # (152,)
xtest_pre = testSet.drop(["cp", "restecg", "slope", "thal", "heartdisease::category|0|1"], axis=1).iloc[:,:].values # (145, 9)
xtest_cp = testSet["cp"].iloc[:].values
xtest_restecg = testSet["restecg"].iloc[:].values
xtest_slope = testSet["slope"].iloc[:].values
xtest_thal = testSet["thal"].iloc[:].values
xtest_cp = ohe1.transform(xtest_cp.reshape(-1,1)) # (145, 4)
xtest_restecg = ohe2.transform(xtest_restecg.reshape(-1,1)) # (145, 3)
xtest_slope = ohe3.transform(xtest_slope.reshape(-1,1)) # (145, 3)
xtest_thal = ohe4.transform(xtest_thal.reshape(-1,1)) # (145, 3)
xTest = np.hstack((xtest_pre, xtest_cp, xtest_restecg, xtest_slope, xtest_thal)) # (145, 22)
yTest = ytest # (145,)
print("-----------------------------------------------------------------")
class_names = np.array([1,0])
## SVM with RBF kernel
svcRBF = SVC(C=300.0, gamma=0.0001, kernel='rbf', probability=True)
svcRBF.fit(xTrain,yTrain)
svcRBFScore = svcRBF.score(xTest, yTest)
prediction_svcRBF = svcRBF.predict(xTest)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_svcRBF, classes=class_names, title='Confusion matrix svcRBF, without normalization')
#
# # Plot normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_svcRBF, classes=class_names, normalize=True,
# title='Normalized confusion matrix svcRBF')
#
# plt.show()
#
#
# f1 = f1_score(yTest, prediction_svcRBF)
# acc1 = accuracy_score(yTest, prediction_svcRBF)
# rec1 = recall_score(yTest, prediction_svcRBF)
# pre1 = precision_score(yTest, prediction_svcRBF)
# auc1 = roc_auc_score(yTest, prediction_svcRBF)  # note: uses hard 0/1 predictions; probability scores (as in the ROC section below) give a more informative AUC
# print(f1)
# print(acc1)
# print(rec1)
# print(pre1)
# print(auc1)
# print("-----------------------------------------------------------------")
## SVM with polynomial kernel
# note: a non-integer degree was accepted by the scikit-learn version used here; newer releases may require an integer degree
svcPoly = SVC(C=1.0, degree=8.666666, coef0=1.0, gamma='scale', max_iter=-1, kernel='poly', probability=True)
svcPoly.fit(xTrain,yTrain)
svcPolyScore = svcPoly.score(xTest, yTest)
prediction_svcPLOY = svcPoly.predict(xTest)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_svcPLOY, classes=class_names, title='Confusion matrix svcPLOY, without normalization')
#
# # Plot normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_svcPLOY, classes=class_names, normalize=True,
# title='Normalized confusion matrix svcPLOY')
#
# plt.show()
#
# f2 = f1_score(yTest, prediction_svcPLOY)
# acc2 = accuracy_score(yTest, prediction_svcPLOY)
# rec2 = recall_score(yTest, prediction_svcPLOY)
# pre2 = precision_score(yTest, prediction_svcPLOY)
# auc2 = roc_auc_score(yTest, prediction_svcPLOY)
# print(f2)
# print(acc2)
# print(rec2)
# print(pre2)
# print(auc2)
# print("-----------------------------------------------------------------")
## bagging with decision trees as the base estimator
decisonTree = tree.DecisionTreeClassifier()
decisonTreeBagging = BaggingClassifier(decisonTree, max_samples=0.7, max_features=1.0)
decisonTreeBagging.fit(xTrain,yTrain)
Bagging_score = decisonTreeBagging.score(xTest,yTest)
prediction_Bagging = decisonTreeBagging.predict(xTest)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_Bagging, classes=class_names, title='Confusion matrix decisonTreeBagging, without normalization')
#
# # Plot normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_Bagging, classes=class_names, normalize=True,
# title='Normalized confusion matrix decisonTreeBagging')
#
# plt.show()
#
# f3 = f1_score(yTest, prediction_Bagging)
# acc3 = accuracy_score(yTest, prediction_Bagging)
# rec3 = recall_score(yTest, prediction_Bagging)
# pre3 = precision_score(yTest, prediction_Bagging)
# auc3 = roc_auc_score(yTest, prediction_Bagging)
# print(f3)
# print(acc3)
# print(rec3)
# print(pre3)
# print(auc3)
# print("-----------------------------------------------------------------")
## AdaBoost with decision trees as the base estimator
decisonTreeAda = AdaBoostClassifier(decisonTree, n_estimators=10, random_state=np.random.RandomState(1))
decisonTreeAda.fit(xTrain,yTrain)
AdaBoost_score = decisonTreeAda.score(xTest,yTest)
prediction_AdaBoost = decisonTreeAda.predict(xTest)
# # Plot non-normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_AdaBoost, classes=class_names, title='Confusion matrix AdaBoost, without normalization')
#
# # Plot normalized confusion matrix
# plot_confusion_matrix(yTest, prediction_AdaBoost, classes=class_names, normalize=True,
# title='Normalized confusion matrix AdaBoost')
#
# plt.show()
#
#
# f4 = f1_score(yTest, prediction_AdaBoost)
# acc4 = accuracy_score(yTest, prediction_AdaBoost)
# rec4 = recall_score(yTest, prediction_AdaBoost)
# pre4 = precision_score(yTest, prediction_AdaBoost)
# auc4 = roc_auc_score(yTest, prediction_AdaBoost)
# print(f4)
# print(acc4)
# print(rec4)
# print(pre4)
# print(auc4)
# print("-----------------------------------------------------------------")
## plot roc curve
prediction_prob_svmRBF = svcRBF.predict_proba(xTest)[:, 1]
prediction_prob_svmPOLY = svcPoly.predict_proba(xTest)[:, 1]
prediction_prob_Bagging = decisonTreeBagging.predict_proba(xTest)[:, 1]
prediction_prob_Adaboost = decisonTreeAda.predict_proba(xTest)[:, 1]
fpr_svmRBF, tpr_svmRBF, _ = roc_curve(yTest, prediction_prob_svmRBF)
fpr_svmPOLY, tpr_svmPOLY, _ = roc_curve(yTest, prediction_prob_svmPOLY)
fpr_Bagging, tpr_Bagging, _ = roc_curve(yTest, prediction_prob_Bagging)
fpr_Adaboost, tpr_Adaboost, _ = roc_curve(yTest, prediction_prob_Adaboost)
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_svmRBF, tpr_svmRBF, label='SVM-RBF')
plt.plot(fpr_svmPOLY, tpr_svmPOLY, label='SVM-POLY')
plt.plot(fpr_Bagging, tpr_Bagging, label='Bagging')
plt.plot(fpr_Adaboost, tpr_Adaboost, label='Adaboost')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
This chapter walked through a predictive analysis of the heart disease dataset with four machine learning algorithms (SVM-RBF, SVM-POLY, Bagging, and AdaBoost). By building confusion matrices and computing F1 score, accuracy, recall, precision, and AUC, it compared the performance of the different algorithms and plotted ROC curves to visualize their classification quality.
