AdaBoost(adaptive boosting 自适应boosting)的一般流程
(1)收集数据:可以使用任意方法收集数据
(2)准备数据:依赖于所使用的弱分类器类型,使用单层决策树,这种分类器可以处理任何数据类型。作为弱分类器,简单分类器的效果更好
(3)分析数据:可以使用任意方法
(4)训练算法:AdaBoost的大部分时间都在训练上,分类器将多次在同一数据集上训练弱分类器。
(5)测试算法:计算分类的错误率
(6)使用算法:同SVM一样,AdaBoost预测两个类别中的一个。如果想把它应用到多个类别的场合,那么就要像多类SVM中的做法一样对AdaBoost进行修改
from numpy import *
def loadSimpData():
    """Build a tiny 5-sample, 2-feature toy dataset for AdaBoost demos.

    Returns:
        (datMat, classLabels): a 5x2 numpy matrix of features and a
        list of class labels in {+1.0, -1.0}.
    """
    datMat = matrix([[1.0, 2.1],
                     [2.0, 1.1],
                     [1.3, 1.0],
                     [1.0, 1.0],
                     [2.0, 1.0]])
    classLabels = [1.0, 1.0, -1.0, -1.0, 1.0]
    return datMat, classLabels
datMat, classLabels = loadSimpData( )
print ( datMat)
[[ 1. 2.1]
[ 2. 1.1]
[ 1.3 1. ]
[ 1. 1. ]
[ 2. 1. ]]
print ( classLabels)
[1.0, 1.0, -1.0, -1.0, 1.0]
基于单层决策树构建弱分类器的伪代码
将最小错误率minError设为+ $ \infty $
对数据集中的每一个特征(第一层循环):
对每个步长(第二层循环):
对每个不等号(第三层循环):
建立一棵单层决策树并利用加权数据集对它进行测试:
如果错误率低于minError,则将当前单层决策树设为最佳单层决策树
返回最佳单层决策树
math. log( 0.97 / 0.03 )
3.4760986898352733
math. log( 0.95 / 0.05 )
2.9444389791664403
math. log( 0.95 / 0.05 )
2.9444389791664403
程序清单7-1 单层决策树生成函数
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Classify every sample with a one-feature decision stump.

    Samples on the `threshIneq` side of `threshVal` in column `dimen`
    ('lt' means <=, anything else means >) are labeled -1.0; the rest
    keep the default label +1.0.

    Returns:
        m x 1 array of predicted labels in {+1.0, -1.0}.
    """
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        hit = column <= threshVal
    else:
        hit = column > threshVal
    predictions = ones((shape(dataMatrix)[0], 1))
    predictions[hit] = -1.0
    return predictions
def buildStump(dataArr, classLabels, D):
    """Find the lowest weighted-error decision stump for the data.

    Exhaustively tries every feature, candidate threshold and
    inequality direction, scoring each stump by its D-weighted
    misclassification error.

    Args:
        dataArr: m x n feature data (anything mat() accepts).
        classLabels: length-m sequence of labels in {+1.0, -1.0}.
        D: m x 1 matrix of sample weights (should sum to 1).

    Returns:
        (bestStump, minError, bestClasEst): a dict with keys
        'dim'/'thresh'/'ineq', the best weighted error (1x1 matrix),
        and the m x 1 prediction array of the best stump.
    """
    dataMatrix = mat(dataArr)
    labelMat = mat(classLabels).T
    m, n = shape(dataMatrix)
    numSteps = 10.0                     # candidate thresholds per feature
    bestStump = {}
    bestClasEst = mat(zeros((m, 1)))
    minError = inf
    for i in range(n):
        rangeMin = dataMatrix[:, i].min()
        rangeMax = dataMatrix[:, i].max()
        stepSize = (rangeMax - rangeMin) / numSteps
        # j = -1 and j = numSteps put the threshold just outside the
        # data range, so "all +1" / "all -1" stumps are considered too.
        for j in range(-1, int(numSteps) + 1):
            for inequal in ['lt', 'gt']:
                threshVal = rangeMin + float(j) * stepSize
                predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)
                errArr = mat(ones((m, 1)))
                errArr[predictedVals == labelMat] = 0   # 0 where correct
                weightedError = D.T * errArr            # 1x1 matrix
                if weightedError < minError:
                    minError = weightedError
                    bestClasEst = predictedVals.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClasEst
D = mat( ones( ( 5 , 1 ) ) / 5 )
print ( D)
[[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]
[ 0.2]]
buildStump( datMat, classLabels, D)
1.0
2.0
0.1
-1
0.9
({'dim': 0, 'ineq': 'lt', 'thresh': 1.3}, matrix([[ 0.2]]), array([[-1.],
[ 1.],
[-1.],
[-1.],
[ 1.]]))
完整AdaBoost算法的伪代码
对每次迭代
利用buildStump()函数找到最佳单层决策树
将最佳单层决策树加入到单层决策树组
计算alpha值
计算新的权重向量D
更新累计类别估计值
如果错误率等于0.0,则退出循环
程序清单7-2 基于单层决策树的AdaBoost训练过程
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Args:
        dataArr: m x n training features.
        classLabels: length-m labels in {+1.0, -1.0}.
        numIt: maximum number of boosting rounds; stops early once the
            training error reaches 0.

    Returns:
        weakClassArr: list of stump dicts, each with keys
        'dim', 'thresh', 'ineq' and 'alpha'.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)           # uniform sample weights to start
    aggClassEst = mat(zeros((m, 1)))    # running weighted vote per sample
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # alpha = 0.5*ln((1-e)/e); max() guards against division by zero.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha      # FIX: key was misspelled 'alpah'
        weakClassArr.append(bestStump)
        # Reweight: correct samples shrink, misclassified samples grow.
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()                 # renormalize to a distribution
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,
                             ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr
classifyArray = adaBoostTrainDS( datMat, classLabels, 9 )
1.0
2.0
0.1
-1
0.9
D: [[ 0.2 0.2 0.2 0.2 0.2]]
classEst: [[-1. 1. -1. -1. 1.]]
[1.0, 1.0, -1.0, -1.0, 1.0]
classLabels: [[-0.69314718]
[-0.69314718]
[ 0.69314718]
[ 0.69314718]
[-0.69314718]]
expon: [[ 2. ]
[ 0.5]
[ 0.5]
[ 0.5]
[ 0.5]]
shapeD (5, 1)
shapeE (5, 1)
[[ 0.4]
[ 0.1]
[ 0.1]
[ 0.1]
[ 0.1]]
aggClassEst: [[-0.69314718 0.69314718 -0.69314718 -0.69314718 0.69314718]]
total Error: 0.2
1.0
2.0
0.1
-1
0.9
shapeD (5, 1)
shapeE (5, 1)
[[ 0.11664237]
[ 0.02916059]
[ 0.17496355]
[ 0.17496355]
[ 0.20412415]]
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
total Error: 0.0
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps.

    Revised version of the trainer that also returns the aggregated
    class estimates (prediction strengths), as needed for ROC plotting.

    Args:
        dataArr: m x n training features.
        classLabels: length-m labels in {+1.0, -1.0}.
        numIt: maximum number of boosting rounds; stops early once the
            training error reaches 0.

    Returns:
        (weakClassArr, aggClassEst): the list of stump dicts (keys
        'dim', 'thresh', 'ineq', 'alpha') and the m x 1 matrix of
        accumulated weighted votes for each training sample.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    D = mat(ones((m, 1)) / m)           # uniform sample weights to start
    aggClassEst = mat(zeros((m, 1)))    # running weighted vote per sample
    for i in range(numIt):
        bestStump, error, classEst = buildStump(dataArr, classLabels, D)
        # alpha = 0.5*ln((1-e)/e); max() guards against division by zero.
        alpha = float(0.5 * log((1.0 - error) / max(error, 1e-16)))
        bestStump['alpha'] = alpha
        weakClassArr.append(bestStump)
        # Reweight: correct samples shrink, misclassified samples grow.
        expon = multiply(-1 * alpha * mat(classLabels).T, classEst)
        D = multiply(D, exp(expon))
        D = D / D.sum()                 # renormalize to a distribution
        aggClassEst += alpha * classEst
        aggErrors = multiply(sign(aggClassEst) != mat(classLabels).T,
                             ones((m, 1)))
        errorRate = aggErrors.sum() / m
        if errorRate == 0.0:
            break
    return weakClassArr, aggClassEst
classifyArray, aggClassEst = adaBoostTrainDS( datMat, classLabels, 9 )
1.0
2.0
0.1
-1
0.9
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
total Error: 0.0
classifyArray
[{'alpha': 0.6931471805599453, 'dim': 0, 'ineq': 'lt', 'thresh': 1.3},
{'alpha': 0.9729550745276565, 'dim': 1, 'ineq': 'lt', 'thresh': 1.0},
{'alpha': 0.8958797346140273,
'dim': 0,
'ineq': 'lt',
'thresh': 0.90000000000000002}]
程序清单7-3 AdoBoost分类函数
def adaClassify(datToClass, classifierArr):
    """Classify samples with a trained AdaBoost stump ensemble.

    Args:
        datToClass: one or more samples (anything mat() accepts).
        classifierArr: list of stump dicts from adaBoostTrainDS, each
            with keys 'dim', 'thresh', 'ineq' and 'alpha'.

    Returns:
        m x 1 matrix of predicted labels in {+1.0, -1.0}.
    """
    dataMatrix = mat(datToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = mat(zeros((m, 1)))
    # Sum alpha-weighted stump votes; the sign of the total is the label.
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'],
                                 stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
    return sign(aggClassEst)
datArr, labelArr = loadSimpData( )
classifierArr, aggClassEst = adaBoostTrainDS( datArr, labelArr, 30 )
1.0
2.0
0.1
-1
0.9
aggClassEst: [[ 1.17568763 2.56198199 -0.77022252 -0.77022252 0.61607184]]
total Error: 0.0
adaClassify( [ 0 , 0 ] , classifierArr)
[[-0.69314718]]
[[-1.66610226]]
[[-2.56198199]]
matrix([[-1.]])
程序清单7-4 自适应数据加载函数
def loadDataSet(fileName):
    """Load a tab-delimited data file; the last column is the label.

    The feature count is inferred from the first line, so the function
    adapts to any number of columns.

    Args:
        fileName: path to a tab-separated text file of floats.

    Returns:
        (dataMat, labelMat): list of feature-value rows and the
        parallel list of float labels.
    """
    dataMat = []
    labelMat = []
    # FIX: the original opened the file twice and never closed either
    # handle; a single with-block reads the header and rewinds instead.
    with open(fileName) as fr:
        numFeat = len(fr.readline().split('\t'))
        fr.seek(0)   # rewind so the first data line is parsed too
        for line in fr:
            curLine = line.strip().split('\t')
            lineArr = [float(curLine[i]) for i in range(numFeat - 1)]
            dataMat.append(lineArr)
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat
datArr, labelArr = loadDataSet( 'horseColicTraining2.txt' )
classifierArray = adaBoostTrainDS( datArr, labelArr, 10 )
testArr, testLabelArr = loadDataSet( 'horseColicTest2.txt' )
prediction10 = adaClassify( testArr, classifierArray)
errArr = mat( ones( ( 67 , 1 ) ) )
su = errArr[ prediction10 != mat( testLabelArr) . T] . sum ( )
print ( su)
程序清单7-5 ROC曲线的绘制及AUC计算函数
# ROC curve plotting (book listing 7-5)
def plotROC(predStrengths, classLabels):
    """Plot the ROC curve for a ranking classifier and print its AUC.

    Args:
        predStrengths: 1 x m row matrix/array of prediction strengths
            (e.g. aggClassEst.T from adaBoostTrainDS).
        classLabels: length-m sequence of true labels in {+1.0, -1.0}.
    """
    import matplotlib.pyplot as plt   # third-party; imported lazily
    cur = (1.0, 1.0)                  # cursor starts at the top-right corner
    ySum = 0.0                        # accumulates rectangle heights for AUC
    numPosClas = sum(array(classLabels) == 1.0)
    yStep = 1 / float(numPosClas)                      # step per true positive
    xStep = 1 / float(len(classLabels) - numPosClas)   # step per false positive
    # argsort is ascending, so we sweep from weakest to strongest prediction.
    sortedIndicies = predStrengths.argsort()
    fig = plt.figure()
    fig.clf()
    ax = plt.subplot(111)
    for index in sortedIndicies.tolist()[0]:
        if classLabels[index] == 1.0:
            delX = 0
            delY = yStep
        else:
            delX = xStep
            delY = 0
            # AUC: add the current height only on horizontal (x) steps —
            # with the pasted indentation lost, this line had drifted to
            # loop level, which would overcount the area.
            ySum += cur[1]
        ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c='b')
        cur = (cur[0] - delX, cur[1] - delY)
    ax.plot([0, 1], [0, 1], 'b--')    # diagonal = random guessing
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve for AdaBoost horse colic detection system')
    ax.axis([0, 1, 0, 1])
    plt.show()
    print("the Area Under the Curve is: ", ySum * xStep)
dataArr, labelArr = loadDataSet("horseColicTraining.txt")
classifierArray,aggClassEst = adaBoostTrainDS(dataArr, labelArr,10)
plotROC(aggClassEst.T,labelArr)