The algorithm from Part (1) is computationally expensive when the dataset is large and has many features, so it needs a small adjustment to avoid that cost while letting the coefficients reach stable values quickly. A good fix is stochastic gradient ascent: instead of sweeping the whole matrix on every update, it updates the weights with one randomly chosen sample at a time.
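(The code below reuses the sigmoid() helper and the numpy star import from Part (1); a minimal sketch for reference, in case that part is not at hand:)

from numpy import *  # supplies shape, ones, array, arange and random.uniform below

def sigmoid(inx):
    # logistic function: squashes any real input into (0, 1)
    return 1.0 / (1.0 + exp(-inx))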
# Stochastic gradient ascent
def stocgradascent(datamatrix, classlabels, number=100):
    m, n = shape(datamatrix)
    weights = ones(n)
    for i in range(number):
        # rebuild the index list on every pass, since indices are removed below
        dataindex = list(range(m))
        for k in range(m):
            # alpha shrinks as the iterations proceed but never reaches 0,
            # so every update still has some influence on the coefficients
            alpha = 3 / (1.0 + i + k) + 0.01
            # draw one sample at random instead of sweeping them in order
            randindex = int(random.uniform(0, len(dataindex)))
            # array multiplication here: the coefficients multiply a single
            # randomly drawn sample, not the whole matrix as in Part (1)
            h = sigmoid(sum(datamatrix[dataindex[randindex]] * weights))
            error = classlabels[dataindex[randindex]] - h
            weights = weights + alpha * error * datamatrix[dataindex[randindex]]
            # drop the used index so each sample is visited once per pass
            del dataindex[randindex]
    return weights
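A minimal call sketch, assuming the loaddataset() loader from Part (1), which returns the feature list (first column a constant 1.0 for the intercept) and the label list:

dataarr, labelmat = loaddataset()                   # loader from Part (1)
weights = stocgradascent(array(dataarr), labelmat)  # 100 passes by default
print(weights)                                      # one coefficient per feature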
Then all that remains is a classifier that turns the model output into a decision:
# Classifier: apply the learned weights to a single input vector
def classifyvector(inx, weights):
    classresult = sigmoid(sum(inx * weights))
    if classresult > 0.5:
        return 1.0
    else:
        return 0.0
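As a quick sanity check, a sketch that scores the training data itself with classifyvector (same loaddataset() assumption as above):

dataarr, labelmat = loaddataset()
weights = stocgradascent(array(dataarr), labelmat)
errorcount = 0
for i in range(len(labelmat)):
    # compare each prediction against the known label
    if classifyvector(array(dataarr[i]), weights) != float(labelmat[i]):
        errorcount += 1
print('training error rate: %.2f' % (errorcount / float(len(labelmat))))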
Plot the data set and the best-fit line:
from matplotlib import pyplot as plt

def plotbestfit(weights):
    # this function was not shown in Part (1)
    datamat, labelmat = loaddataset()
    dataarray = array(datamat)
    n = dataarray.shape[0]  # shape is an attribute, not a method
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelmat[i]) == 1:
            xcord1.append(dataarray[i, 1]); ycord1.append(dataarray[i, 2])
        else:
            xcord2.append(dataarray[i, 1]); ycord2.append(dataarray[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='blue')
    # the boundary is where w0 + w1*x1 + w2*x2 = 0; solve for x2 given x1
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
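To see the result, train and plot in one go (same assumptions as above):

dataarr, labelmat = loaddataset()
plotbestfit(stocgradascent(array(dataarr), labelmat))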