The data comes from the file "testset.txt" in the book 《机器学习算法原理与编程实践》 (Machine Learning Algorithms: Principles and Programming Practice).
import numpy as np
import matplotlib.pyplot as plt
def scatter_data(data):
    plt.figure(figsize=(10, 9))
    for i in range(data.shape[0]):
        if data[i, -1] == 0:
            plt.scatter(data[i, 0], data[i, 1], marker='o', c='r', s=30)
        else:
            plt.scatter(data[i, 0], data[i, 1], marker='s', c='g', s=30)
    plt.xlabel('first_column_feature', fontsize=12)
    plt.ylabel('second_column_feature', fontsize=12)
    plt.show()
    return None
def plot_data(data, weight):
    plt.figure(figsize=(10, 9))
    plt.scatter(data[:, 0], data[:, 1], c=data[:, -1])
    x = np.linspace(-3, 3, 100)
    a = -weight[1]/weight[2]  # how are the weights being used here? (question 2 below)
    b = -weight[0]/weight[2]
    y = a*x + b
    plt.plot(x, y, c='r', lw=2)
    plt.show()
    return None
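# Sketch of how I think plot_data uses the weights (relates to question 2 below; might be wrong):
# points on the decision boundary satisfy weight[0] + weight[1]*x1 + weight[2]*x2 = 0,
# so solving for the second feature gives x2 = -(weight[0] + weight[1]*x1)/weight[2],
# which is exactly slope a = -weight[1]/weight[2] and intercept b = -weight[0]/weight[2].
# The helper name boundary_second_feature is just mine, not from the book.
def boundary_second_feature(weight, x1):
    return -(weight[0] + weight[1]*x1)/weight[2]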
def logistic_function(fx, *threshold):
    output = 1/(1 + np.exp(-fx))
    # output[output > threshold] = 1
    # output[output <= threshold] = 0
    return output
def test_data(data, weight):
    b = np.ones([data.shape[0], 1])
    data_test = np.hstack([b, data])
    print(data_test)
    fx = np.dot(data_test, weight)
    print(fx)
    prob = 1/(1 + np.exp(-fx))
    # if prob > 0.5:
    #     return 1.0
    # else:
    #     return 0
    print(prob)
    prob[prob > 0.5] = 0
    prob[prob <= 0.5] = 1.0
    return prob
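# Sketch of my own reading of the threshold (relates to question 3 below; might be wrong):
# sigmoid(fx) is supposed to be P(label = 1 | x), so I would have expected prob > 0.5
# to map to label 1.0 rather than 0. The helper name predict_label is just mine.
def predict_label(prob):
    return (prob > 0.5).astype(float)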
if __name__ == '__main__':
    np.set_printoptions(suppress=True, precision=2)
    data = np.loadtxt('./testset.txt')
    # split the dataset into features and labels
    x = data[:, :-1]
    y = data[:, -1].reshape(-1, 1)
    # build the new x, mainly to prepend a column of ones for the bias term
    b = np.ones([x.shape[0], 1])
    x = np.hstack([b, x])
    # plot the two classes of the dataset
    # scatter_data(data)

    # define the learning rate (step size) and number of iterations
    alpha = 0.01
    steps = 500
    weight = np.ones([x.shape[1], 1])  # initialize the weights

    # main loop: iterative weight updates
    w_ls = []
    for i in range(steps):  # why don't the weights I get seem to change?
        w_ls.append(weight)
        fx = np.dot(x, weight)
        output = logistic_function(fx, 0.5)
        error = y - output
        weight = weight + alpha*np.dot(x.T, error)
        # if i == 30:
        #     break
        # print(weight)
    plot_data(data, weight)
    # predict on the test data
    x_test = np.array([[-0.1473, 2.8748], [-0, 0], [0, 10]])
    test_label = test_data(x_test, weight)
    print(test_label)
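    # Quick check on my inline question "why don't the weights I get seem to change":
    # each update builds a new array, so the snapshots stored in w_ls are all different
    # objects; printing the first and last one should show the change.
    print(w_ls[0].ravel(), w_ls[-1].ravel())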
There are a few things in the code above that I don't understand:
1. The gradient descent here doesn't match my understanding of it as a derivative; it doesn't look like a derivative at all, and I don't get why the weight update is implemented this way. Why is it weight = weight + alpha*np.dot(x.T, error) with a plus sign, instead of weight = weight - alpha*np.dot(x.T, error)? (my current guess is the sketch right after these questions)
2. Given one feature as input, how do the weights give me the other feature? The way I usually understand it, both features are fed in together for training. (this is the line I marked in plot_data; my guess is the small sketch right after that function)
3. In logistic regression, I can't work out whether a value above the threshold should be labeled 1 or 0 (see the note after test_data).
Ugh, I did manage to write the code, but I'm still pretty lost; there's a lot I haven't figured out, and it feels awful.
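For question 1, here is a small runnable sketch of my current guess (it may be wrong, and the helper names sigmoid and nll_gradient are mine, not from the book): error = y - output is the negative of the gradient of the negative log-likelihood, so adding alpha*np.dot(x.T, error) is gradient ascent on the log-likelihood, which works out to the same update as gradient descent on the negative log-likelihood.

import numpy as np

def sigmoid(z):
    return 1/(1 + np.exp(-z))

# tiny made-up example: 5 samples, columns = [bias, feature1, feature2]
rng = np.random.default_rng(0)
X = np.hstack([np.ones((5, 1)), rng.normal(size=(5, 2))])
y = np.array([[0.], [1.], [1.], [0.], [1.]])
w = np.ones((3, 1))
alpha = 0.01

# gradient of the negative log-likelihood with respect to w: X.T @ (sigmoid(Xw) - y)
nll_gradient = np.dot(X.T, sigmoid(np.dot(X, w)) - y)
w_minus = w - alpha*nll_gradient          # gradient descent, with the minus sign I expected

error = y - sigmoid(np.dot(X, w))         # the same "error" as in the training loop above
w_plus = w + alpha*np.dot(X.T, error)     # the "+" update used in the code above

print(np.allclose(w_minus, w_plus))       # True: the two updates are identical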