from numpy import *
def loadSimpData():
    """Build the tiny 5-point toy dataset used to demo AdaBoost.

    Returns:
        (matrix, list): a 5x2 feature matrix and the matching +1/-1 labels.
    """
    features = [[1, 2.1], [2, 1.1], [1.3, 1], [1, 1], [2, 1]]
    labels = [1, 1, -1, -1, 1]
    return matrix(features), labels
# One-level decision stump ("decision tree with a single split").
# Classifies each sample as +1 or -1 by comparing one feature against a threshold.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Predict +1/-1 for every sample using a single threshold test.

    Args:
        dataMatrix: m x n matrix of samples.
        dimen: index of the feature column to test.
        threshVal: threshold to compare against.
        threshIneq: 'lt' marks values <= threshold as -1; anything else
            marks values > threshold as -1.

    Returns:
        ndarray: m x 1 array of +1.0 / -1.0 predictions.
    """
    column = dataMatrix[:, dimen]
    predictions = ones((shape(dataMatrix)[0], 1))
    if threshIneq == 'lt':
        flipped = column <= threshVal
    else:
        flipped = column > threshVal
    predictions[flipped] = -1.0
    return predictions
# Exhaustive grid search over (feature, threshold, inequality direction).
def buildStump(dataArr, classLabels, D):
    """Find the decision stump with the lowest weighted error.

    Scans every feature, a grid of thresholds over that feature's range,
    and both inequality directions, keeping the combination that minimizes
    the D-weighted misclassification rate.

    Args:
        dataArr: sample features (anything `matrix()` accepts, m x n).
        classLabels: length-m sequence of +1/-1 labels.
        D: m x 1 matrix of per-sample weights (should sum to 1).

    Returns:
        (dict, matrix, ndarray): best stump as {'dim', 'thresh', 'ineq'},
        its weighted error (1x1 matrix), and its m x 1 predictions.
    """
    X = matrix(dataArr)
    y = matrix(classLabels).T
    m, n = X.shape
    numSteps = 10.0
    bestStump = {}
    bestClassEst = matrix(zeros((m, 1)))
    minError = inf
    for i in range(n):
        # Candidate thresholds span this feature's observed range.
        lo = X[:, i].min()
        hi = X[:, i].max()
        step = (hi - lo) / numSteps
        # j = -1 and j = numSteps push the threshold just outside the range,
        # so "all +1" / "all -1" splits are also tried.
        for j in range(-1, int(numSteps) + 1):
            threshVal = lo + j * step
            for inequal in ['lt', 'gt']:
                predictions = stumpClassify(X, i, threshVal, inequal)
                misclassified = matrix(ones((m, 1)))
                misclassified[predictions == y] = 0
                # Low-weight samples contribute little to the error, so
                # misclassifying them matters less.
                weightedError = D.T * misclassified
                print("分割的列:" + str(i) + "阈值为:" + str(threshVal) + "符号:" + str(inequal) + "错误率:" + str(weightedError))
                if weightedError < minError:
                    minError = weightedError
                    bestClassEst = predictions.copy()
                    bestStump['dim'] = i
                    bestStump['thresh'] = threshVal
                    bestStump['ineq'] = inequal
    return bestStump, minError, bestClassEst
# Full AdaBoost training loop; numIt=40 is the maximum number of boosting rounds.
def adaBoostTrainDS(dataArr, classLabels, numIt=40):
    """Train an AdaBoost ensemble of decision stumps ("DS").

    Args:
        dataArr: m x n training features.
        classLabels: length-m sequence of +1/-1 labels.
        numIt: maximum number of boosting rounds; training stops early
            once the ensemble's training error reaches zero.

    Returns:
        list[dict]: weak classifiers, each {'dim', 'thresh', 'ineq', 'alpha'}.
    """
    weakClassArr = []
    m = shape(dataArr)[0]
    # Every one of the m samples starts with equal weight 1/m.
    D = matrix(ones((m, 1)) / m)
    aggClassEst = matrix(zeros((m, 1)))
    labelCol = matrix(classLabels).T
    for _ in range(numIt):
        stump, error, classEst = buildStump(dataArr, classLabels, D)
        print("D:", D.T)
        # Classifier weight: larger when the weighted error is smaller.
        # The 1e-16 floor avoids division by zero for a perfect stump.
        alpha = float(0.5 * log((1.0 - error) / maximum(error, 1e-16)))
        stump['alpha'] = alpha
        weakClassArr.append(stump)
        print("classEst:", classEst)
        # Re-weight samples for the next round: correctly classified samples
        # shrink (exponent negative), misclassified ones grow.
        expon = multiply(-1 * alpha * labelCol, classEst)
        print("expon:", expon)
        D = multiply(D, exp(expon))
        D = D / D.sum()
        print("D:", D.T)
        # Running alpha-weighted ensemble vote; its sign is the prediction.
        aggClassEst += alpha * classEst
        print("aggClassEst:", aggClassEst)
        aggErrors = multiply(sign(aggClassEst) != labelCol, ones((m, 1)))
        print("aggErrors:", aggErrors)
        errorRate = aggErrors.sum() / m
        print("errorRate:", errorRate)
        if errorRate == 0.0:
            break
    return weakClassArr
# Combine the trained weak classifiers into a single strong classifier.
def adaClassify(dataToClass, classifierArr):
    """Classify samples with a trained AdaBoost ensemble.

    Args:
        dataToClass: one or more samples (anything `matrix()` accepts).
        classifierArr: stumps from adaBoostTrainDS, each carrying 'alpha'.

    Returns:
        matrix: m x 1 matrix of +1/-1 (or 0) class predictions.
    """
    dataMatrix = matrix(dataToClass)
    m = shape(dataMatrix)[0]
    aggClassEst = matrix(zeros((m, 1)))
    # Sum each stump's alpha-weighted vote; the sign decides the class.
    for stump in classifierArr:
        classEst = stumpClassify(dataMatrix, stump['dim'], stump['thresh'], stump['ineq'])
        aggClassEst += stump['alpha'] * classEst
        print("aggClassEst:", aggClassEst)
    return sign(aggClassEst)
if __name__ == "__main__":
    # Demo: train an AdaBoost ensemble on the toy dataset (at most 9 rounds),
    # then classify a new point at the origin.
    datMat, classLabels = loadSimpData()
    print("=============")
    weakClassArr = adaBoostTrainDS(datMat, classLabels, 9)
    print(weakClassArr)
    print("======asd=======")
    res = adaClassify([0, 0], weakClassArr)
    print(res)
# NOTE(review): removed web-page scrape residue that trailed the script
# (post dates, view counts, and a "folded comments" banner) — it was not
# Python and made the file a syntax error.