Problem: You are given a dataset of chip records containing the results of each chip's first and second quality tests, together with a label indicating whether the chip was accepted. As the factory supervisor, build a logistic regression model to predict whether a chip is usable.
The code is as follows:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # matplotlib.pyplot is a collection of command-style functions that make matplotlib behave like MATLAB
import seaborn as sns
import scipy.optimize as opt
from sklearn.metrics import classification_report
# Define sigmoid so that the hypothesis can later be written as sigmoid(z)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
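# Quick sanity check (illustrative, not part of the assignment): sigmoid maps
# 0 to 0.5 and is applied element-wise to arrays, e.g.
#   sigmoid(np.array([-10., 0., 10.]))  ->  approx. [4.54e-05, 0.5, 0.99995]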
# Regularized cost function
def computeCost(theta, X, y, l=1):
    predictions = sigmoid(X @ theta)  # @ performs matrix multiplication
    first = np.multiply(-y, np.log(predictions))
    second = np.multiply((1 - y), np.log(1 - predictions))
    theta_1 = theta[1:]  # theta_0 is excluded from the penalty
    regularized_term = l / (2 * len(X)) * np.power(theta_1, 2).sum()  # regularization term
    return np.sum(first - second) / len(X) + regularized_term  # returns a scalar
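# The cost implemented above is the regularized logistic-regression cost
#   J(theta) = (1/m) * sum_i [ -y_i*log(h(x_i)) - (1-y_i)*log(1-h(x_i)) ]
#              + (l/(2m)) * sum_{j>=1} theta_j^2
# where h(x) = sigmoid(theta^T x) and m = len(X); note that theta_0 is
# excluded from the penalty term.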
data = pd.read_csv('ex2data2.txt', names = ['test_1', 'test_2', 'accepted'])
sns.set(context='notebook', style='whitegrid')
sns.lmplot(x='test_1', y='test_2', hue='accepted',  # color the points by the `accepted` label
           data=data,
           height=6,       # height of each facet, in inches
           fit_reg=False,  # do not draw a fitted regression line
           scatter_kws={"s": 50}
           # extra keyword arguments passed through as a dict to plt.scatter();
           # s is the marker size (default 20), raised to 50 here
           )
plt.show()
# Map the two raw features onto all polynomial terms x^(i-p) * y^p up to the given power
def feature_mapping(x, y, power, as_ndarray=False):
    data = {'f{}{}'.format(i - p, p): np.power(x, i - p) * np.power(y, p)
            for i in np.arange(power + 1)
            for p in np.arange(i + 1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
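# Illustration (a sketch, not run here): with power=2 the mapping produces the
# six terms 1, x, y, x^2, x*y, y^2, with columns named by their exponents:
#   feature_mapping(np.array([2.]), np.array([3.]), power=2)
#   -> columns f00, f10, f01, f20, f11, f02 with values [1, 2, 3, 4, 6, 9]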
x1 = np.array(data.test_1)
x2 = np.array(data.test_2)
y = np.array(data.accepted)  # capture the labels before `data` is overwritten below
data = feature_mapping(x1, x2, power=6)
print(data.shape)
print(data.head(5))
theta = np.zeros(data.shape[1])
X = feature_mapping(x1, x2, power=6, as_ndarray=True)  # feature matrix, shape (118, 28)
print(theta)
print(data)
print("X shape:", X.shape)                    # should be (m, n) = (118, 28)
print("y shape:", y.shape)                    # should be (m,) = (118,)
print("Initial theta shape:", theta.shape)    # should be (n,) = (28,)
print("Theta (flattened):", theta.flatten())  # 1-D array
print(computeCost(theta, X, y))  # with theta = 0 every prediction is 0.5, so the initial cost is ln 2 ≈ 0.6931
# Regularized gradient; theta_0 is not penalized, hence the 0 prepended below
def gradient(theta, X, y, l=1):
    theta_1 = theta[1:]
    regularized_term = np.concatenate([np.array([0]), l * theta_1 / len(X)])
    return (1 / len(X)) * (X.T @ (sigmoid(X @ theta) - y)) + regularized_term
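# The gradient implemented above is, component-wise,
#   dJ/dtheta_j = (1/m) * sum_i (h(x_i) - y_i) * x_ij + (l/m) * theta_j   for j >= 1
# with no regularization term for j = 0, which is why a 0 is prepended to the
# regularization vector.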
print(gradient(theta, X, y))
res = opt.minimize(fun=computeCost, x0=theta.flatten(), args=(X, y, 1), method='Newton-CG', jac=gradient)
print(res)
# Convert probabilities to 0/1 predictions with a 0.5 threshold
def predict(X, theta):
    probability = sigmoid(X @ theta)
    return [1 if x >= 0.5 else 0 for x in probability]
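# Equivalent vectorized one-liner (illustrative alternative):
#   (sigmoid(X @ theta) >= 0.5).astype(int)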
last_theta = res.x
y = y.astype(int)
y_pred = np.array(predict(X, last_theta)).astype(int)
print("y true type:", y.dtype)        # should be int
print("y pred type:", y_pred.dtype)   # should be int
print("y true shape:", y.shape)       # should be (118,)
print("y pred shape:", y_pred.shape)  # should be (118,)
print("y_pred:", y_pred)
print(classification_report(y, y_pred))
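# Optional cross-check (a sketch, not run here): the overall accuracy should
# match the `accuracy` row of the classification report.
#   from sklearn.metrics import accuracy_score
#   print("accuracy:", accuracy_score(y, y_pred))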
# Re-fit the model for a given polynomial power and regularization strength l
def find_theta(power, l):
    path = 'ex2data2.txt'
    data = pd.read_csv(path, header=None, names=['test_1', 'test_2', 'accepted'])
    Y = data.accepted.values
    x1 = data.test_1.values
    x2 = data.test_2.values
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])
    res = opt.minimize(fun=computeCost, x0=theta, args=(X, Y, l), method='TNC', jac=gradient)
    return res.x
# Find the decision boundary: the points where theta^T x = 0, kept when |theta^T x| <= threshold
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.2, density)  # `density` samples per axis
    t2 = np.linspace(-1, 1.2, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    # separate the x and y coordinates
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)
    pred = mapped_cord.values @ theta
    # keep only the grid points where |theta^T x| is (almost) zero
    decision = mapped_cord[np.abs(pred) <= threshold]
    # f10 and f01 are the raw x and y coordinates of the retained points
    return decision.f10, decision.f01
# Plot the decision boundary on top of the data
def draw_boundary(power, l):
    density = 1000  # 1000 grid values per axis, i.e. 10^6 candidate points in total
    threshold = 2 * 10 ** -3  # the tolerance epsilon for treating theta^T x as 0
    # call the two helper functions defined above
    theta = find_theta(power, l)
    x, y = find_decision_boundary(density, power, theta, threshold)
    path = 'ex2data2.txt'
    data = pd.read_csv(path, header=None, names=['test_1', 'test_2', 'accepted'])
    sns.set(context='notebook', style='whitegrid', palette='bright')
    sns.lmplot(x='test_1', y='test_2', hue='accepted',  # color the points by the `accepted` label
               data=data,
               height=6,       # height of each facet, in inches
               fit_reg=False,  # do not draw a fitted regression line
               scatter_kws={"s": 100}
               )
    plt.scatter(x, y, c='red', s=10)
    plt.title('Decision Boundary')  # set the title
    plt.show()
draw_boundary(6, l=1)
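# To see the effect of regularization, one could also try (not run here):
#   draw_boundary(6, l=0)    # lambda = 0: no regularization, overfitted, wiggly boundary
#   draw_boundary(6, l=100)  # large lambda: underfitted, overly smooth boundary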
The output is as follows:
(118, 28)
f00 f10 f01 f20 ... f33 f24 f15 f06
0 1.0 0.051267 0.69956 0.002628 ... 0.000046 0.000629 0.008589 0.117206
1 1.0 -0.092742 0.68494 0.008601 ... -0.000256 0.001893 -0.013981 0.103256
2 1.0 -0.213710 0.69225 0.045672 ... -0.003238 0.010488 -0.033973 0.110047
3 1.0 -0.375000 0.50219 0.140625 ... -0.006679 0.008944 -0.011978 0.016040
4 1.0 -0.513250 0.46564 0.263426 ... -0.013650 0.012384 -0.011235 0.010193
[5 rows x 28 columns]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
f00 f10 f01 ... f24 f15 f06
0 1.0 0.051267 0.699560 ... 6.294709e-04 8.589398e-03 1.172060e-01
1 1.0 -0.092742 0.684940 ... 1.893054e-03 -1.398103e-02 1.032560e-01
2 1.0 -0.213710 0.692250 ... 1.048821e-02 -3.397345e-02 1.100469e-01
3 1.0 -0.375000 0.502190 ... 8.944062e-03 -1.197765e-02 1.604015e-02
4 1.0 -0.513250 0.465640 ... 1.238395e-02 -1.123519e-02 1.019299e-02
.. ... ... ... ... ... ... ...
113 1.0 -0.720620 0.538740 ... 4.374511e-02 -3.270412e-02 2.444980e-02
114 1.0 -0.593890 0.494880 ... 2.115493e-02 -1.762810e-02 1.468924e-02
115 1.0 -0.484450 0.999270 ... 2.340073e-01 -4.826843e-01 9.956280e-01
116 1.0 -0.006336 0.999270 ... 4.003286e-05 -6.313306e-03 9.956280e-01
117 1.0 0.632650 -0.030612 ... 3.514745e-07 -1.700678e-08 8.229060e-10
[118 rows x 28 columns]
X shape: (118, 28)
y shape: (118,)
Initial theta shape: (28,)
Theta (flattened): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0.]
0.6931471805599455
[ 0.3742744 0.02754347 -0.01138344 0.1101541 -0.0082709 0.04280266
0.02898635 0.00572266 0.00935661 -0.02715716 0.05754036 -0.00212496
0.01529654 -0.00205989 -0.01653743 0.02527397 0.00359114 0.00457371
0.00103564 0.00397128 -0.04114771 0.03779259 -0.00041107 0.00708275
-0.00039137 0.00311361 -0.00047758 -0.04007703]
fun: 0.12581644042843604
jac: array([-1.89698754e-08, -4.43334676e-08, 3.67869823e-08, -1.82914720e-08,
7.14343285e-09, -1.64723202e-08, -1.74613956e-08, 8.13692542e-09,
-4.96652423e-09, 2.53731197e-08, -9.03830565e-09, 3.99948787e-09,
-5.19088915e-09, 1.11943082e-09, 3.21779145e-08, -9.56322308e-09,
1.12350354e-09, -2.99585141e-09, 1.32421078e-09, -1.14541087e-08,
2.04749310e-08, -8.46148423e-10, 2.53035948e-09, -2.28404791e-09,
1.08952375e-09, -2.97163925e-09, 5.27178326e-10, 3.37589954e-08])
message: 'Optimization terminated successfully.'
nfev: 8
nhev: 0
nit: 7
njev: 45
status: 0
success: True
x: array([-3.99283764, 0.16069382, 0.20571814, -0.2709825 , -0.07035712,
1.14284717, -0.04841805, -0.12075286, 0.1585947 , 0.9478175 ,
-0.15049082, -0.04643481, -0.01713725, 0.03508623, 1.61527416,
-0.06366692, -0.04758619, 0.00649483, -0.05406478, 0.16566893,
1.6230761 , -0.09710025, -0.02486652, -0.01971631, -0.01895734,
0.01164191, 0.1041576 , 2.14295338])
y true type: int32
y pred type: int32
y true shape: (118,)
y pred shape: (118,)
y_pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 1 1 0]
precision recall f1-score support
0 1.00 0.92 0.96 116
1 0.18 1.00 0.31 2
accuracy 0.92 118
macro avg 0.59 0.96 0.63 118
weighted avg 0.99 0.92 0.95 118

[Figure: scatter plot of the raw data]

[Figure: the decision boundary traced out point by point]
Notes on the assignment:
1. When calling classification_report, a mismatch between the dtypes of the true labels and the predicted labels can raise an error. Since this is a classification problem, the final outputs should be discrete class labels rather than continuous values, so both arrays are explicitly converted to integer arrays before generating the report.
2. theta_0 is left unregularized in both the cost function and the gradient because it represents the intercept of the decision boundary. The intercept's size is determined by the samples themselves: if the positive and negative samples take large values, theta_0 will be correspondingly large, so it should not be penalized toward zero.
3. The decision boundary is not drawn from a closed-form expression as in linear regression. Instead, a Monte-Carlo-like idea is used: a dense grid of roughly a million points covers the plotting region, each point is substituted into theta^T x = 0, and a point is taken to lie on the curve when the equation (approximately) holds. Plotting the retained points one dot at a time traces out the rough shape of the boundary.