Learning Machine Learning from Scratch--Day11--Logistic Regression Assignment--Regularization

Problem: You are given a dataset of chip records, containing the results of a first test, a second test, and whether each chip was accepted. Suppose you are a factory supervisor; build a logistic regression model to predict whether a chip is usable.

The code is as follows:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt  # matplotlib.pyplot is a collection of command-style functions that make matplotlib work in a MATLAB-like way
import seaborn as sns
import scipy.optimize as opt
from sklearn.metrics import classification_report

# Define sigmoid so that the hypothesis can later be written as sigmoid(z)
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
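# The hypothesis used throughout is h_theta(x) = sigmoid(theta^T x).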

# Define the cost function
def computeCost(theta, X, y, l=1):
    predictions = sigmoid(X @ theta)  # @ performs matrix multiplication
    first = np.multiply(-y, np.log(predictions))
    second = np.multiply((1 - y), np.log(1 - predictions))
    theta_1 = theta[1:]
    regularized_term = l / (2 * len(X)) * np.power(theta_1, 2).sum()  # regularization term
    return (np.sum(first - second) / len(X)) + regularized_term  # returns a scalar
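# The function above implements the regularized logistic regression cost
#   J(theta) = (1/m) * sum(-y*log(h) - (1-y)*log(1-h)) + (lambda/(2m)) * sum_{j>=1} theta_j^2
# where h = sigmoid(X @ theta); note that theta_0 is deliberately left out of the penalty.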


data = pd.read_csv('ex2data2.txt', names = ['test_1', 'test_2', 'accepted'])

sns.set(context='notebook', style='whitegrid')
sns.lmplot(x='test_1', y='test_2', hue='accepted',  # variables that define the data subsets to plot
           data=data,
           height=6,  # height of each facet in inches
           fit_reg=False,  # do not draw the fitted regression line
           scatter_kws={"s": 50}
           # extra keyword arguments passed as a dict to plt.scatter();
           # here s (marker size) is raised from its default of 20 to 50
           )
plt.show()

def feature_mapping(x, y, power, as_ndarray=False):
    data = {'f{}{}'.format(i-p, p): np.power(x, i-p)*np.power(y, p)
                for i in np.arange(power+1)
                for p in np.arange(i+1)
            }
    if as_ndarray:
        return pd.DataFrame(data).values
    else:
        return pd.DataFrame(data)
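# feature_mapping expands the two raw features into every monomial x^(i-p) * y^p of
# total degree <= power, i.e. [1, x, y, x^2, x*y, y^2, ..., y^6] for power=6.
# That gives C(power+2, 2) = 28 columns, which lets a highly non-linear boundary be fit.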


x1 = np.array(data.test_1)
x2 = np.array(data.test_2)
y = np.array(data.accepted)  # extract the labels before data is overwritten with the mapped features
data = feature_mapping(x1, x2, power=6)
print(data.shape)
print(data.head(5))



theta = np.zeros(data.shape[1])

X = feature_mapping(x1, x2, power=6, as_ndarray=True)  # feature matrix (118, 28)

print(theta)
print(data)

print("X shape:", X.shape)  # should be (m, n) = (118, 28)
print("y shape:", y.shape)  # should be (m,) = (118,)
print("Initial theta shape:", theta.shape)  # should be (n,) = (28,)
print("Theta (flattened):", theta.flatten())  # 1-D array

print(computeCost(theta, X, y))


def gradient(theta, X, y, l=1):
    theta_1 = theta[1:]
    regularized_term = np.concatenate([np.array([0]), l * theta_1 / len(X)])
    return (1/len(X))*(X.T@(sigmoid(X@theta)-y))+regularized_term
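# gradient implements the regularized gradient (1/m) * X^T (h - y) + (lambda/m) * theta_j
# for j >= 1; the 0 prepended in regularized_term leaves theta_0 (the intercept) unpenalized.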

print(gradient(theta, X, y))

res = opt.minimize(fun=computeCost, x0=theta.flatten(), args=(X, y, 1), method='Newton-CG', jac=gradient)  # lambda = 1

print(res)

def predict(X, theta):
    probability = sigmoid(X@theta)
    return [1 if x >= 0.5 else 0 for x in probability]
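# An equivalent vectorized form: (probability >= 0.5).astype(int)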

last_theta = res.x
y = y.astype(int)
y_pred = np.array(predict(X, last_theta)).astype(int)
print("y true type:", y.dtype)  # 应该是 int
print("y pred type:", y_pred.dtype)  # 应该是 int
print("y true shape:", y.shape)  # 应该是 (118,)
print("y pred shape:", np.array(y_pred).shape)  # 应该是 (118,)
print("y_pred:", y_pred)
print(classification_report(y, y_pred))




def find_theta(power, l):
    path = 'ex2data2.txt'
    data = pd.read_csv(path, header=None, names=['test_1', 'test_2', 'accepted'])

    Y = data.accepted
    x1 = data.test_1.values
    x2 = data.test_2.values
    X = feature_mapping(x1, x2, power, as_ndarray=True)
    theta = np.zeros(X.shape[1])

    res = opt.minimize(fun=computeCost, x0=theta.flatten(), args=(X, Y, l), method='TNC', jac=gradient)
    return res.x


# Find the decision boundary: in theory theta^T x = 0; in practice keep points with |theta^T x| <= threshold
def find_decision_boundary(density, power, theta, threshold):
    t1 = np.linspace(-1, 1.2, density)  # density sample points per axis
    t2 = np.linspace(-1, 1.2, density)
    coordinates = [(x, y) for x in t1 for y in t2]
    # separate the x and y coordinates
    x_cord, y_cord = zip(*coordinates)
    mapped_cord = feature_mapping(x_cord, y_cord, power)

    pred = mapped_cord.values @ theta
    # keep only the grid points whose |theta^T x| falls within the threshold
    decision = mapped_cord[np.abs(pred) <= threshold]

    return decision.f10, decision.f01

# Plot the decision boundary
def draw_boundary(power, l):
    # 1000 grid points per axis
    density = 1000
    # threshold (0.002 here plays the role of the ε mentioned above)
    threshold = 2 * 10 ** -3

    # call the two helper functions above
    theta = find_theta(power, l)
    x, y = find_decision_boundary(density, power, theta, threshold)
    path = 'ex2data2.txt'
    data = pd.read_csv(path, header=None, names=['test_1', 'test_2', 'accepted'])

    sns.set(context='notebook', style='whitegrid', palette='bright')
    sns.lmplot(x='test_1', y='test_2', hue='accepted',  # variables that define the data subsets to plot
               data=data,
               height=6,  # height of each facet in inches
               fit_reg=False,  # do not draw the fitted regression line
               scatter_kws={"s": 100}
               )
    plt.scatter(x, y, c='red', s=10)
    plt.title('Decision Boundary')  # set the title
    plt.show()

draw_boundary(6, l=1)

The output is as follows:

(118, 28)
   f00       f10      f01       f20  ...       f33       f24       f15       f06
0  1.0  0.051267  0.69956  0.002628  ...  0.000046  0.000629  0.008589  0.117206
1  1.0 -0.092742  0.68494  0.008601  ... -0.000256  0.001893 -0.013981  0.103256
2  1.0 -0.213710  0.69225  0.045672  ... -0.003238  0.010488 -0.033973  0.110047
3  1.0 -0.375000  0.50219  0.140625  ... -0.006679  0.008944 -0.011978  0.016040
4  1.0 -0.513250  0.46564  0.263426  ... -0.013650  0.012384 -0.011235  0.010193

[5 rows x 28 columns]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
     f00       f10       f01  ...           f24           f15           f06
0    1.0  0.051267  0.699560  ...  6.294709e-04  8.589398e-03  1.172060e-01
1    1.0 -0.092742  0.684940  ...  1.893054e-03 -1.398103e-02  1.032560e-01
2    1.0 -0.213710  0.692250  ...  1.048821e-02 -3.397345e-02  1.100469e-01
3    1.0 -0.375000  0.502190  ...  8.944062e-03 -1.197765e-02  1.604015e-02
4    1.0 -0.513250  0.465640  ...  1.238395e-02 -1.123519e-02  1.019299e-02
..   ...       ...       ...  ...           ...           ...           ...
113  1.0 -0.720620  0.538740  ...  4.374511e-02 -3.270412e-02  2.444980e-02
114  1.0 -0.593890  0.494880  ...  2.115493e-02 -1.762810e-02  1.468924e-02
115  1.0 -0.484450  0.999270  ...  2.340073e-01 -4.826843e-01  9.956280e-01
116  1.0 -0.006336  0.999270  ...  4.003286e-05 -6.313306e-03  9.956280e-01
117  1.0  0.632650 -0.030612  ...  3.514745e-07 -1.700678e-08  8.229060e-10

[118 rows x 28 columns]
X shape: (118, 28)
y shape: (118,)
Initial theta shape: (28,)
Theta (flattened): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
0.6931471805599455
[ 0.3742744   0.02754347 -0.01138344  0.1101541  -0.0082709   0.04280266
  0.02898635  0.00572266  0.00935661 -0.02715716  0.05754036 -0.00212496
  0.01529654 -0.00205989 -0.01653743  0.02527397  0.00359114  0.00457371
  0.00103564  0.00397128 -0.04114771  0.03779259 -0.00041107  0.00708275
 -0.00039137  0.00311361 -0.00047758 -0.04007703]
     fun: 0.12581644042843604
     jac: array([-1.89698754e-08, -4.43334676e-08,  3.67869823e-08, -1.82914720e-08,
        7.14343285e-09, -1.64723202e-08, -1.74613956e-08,  8.13692542e-09,
       -4.96652423e-09,  2.53731197e-08, -9.03830565e-09,  3.99948787e-09,
       -5.19088915e-09,  1.11943082e-09,  3.21779145e-08, -9.56322308e-09,
        1.12350354e-09, -2.99585141e-09,  1.32421078e-09, -1.14541087e-08,
        2.04749310e-08, -8.46148423e-10,  2.53035948e-09, -2.28404791e-09,
        1.08952375e-09, -2.97163925e-09,  5.27178326e-10,  3.37589954e-08])
 message: 'Optimization terminated successfully.'
    nfev: 8
    nhev: 0
     nit: 7
    njev: 45
  status: 0
 success: True
       x: array([-3.99283764,  0.16069382,  0.20571814, -0.2709825 , -0.07035712,
        1.14284717, -0.04841805, -0.12075286,  0.1585947 ,  0.9478175 ,
       -0.15049082, -0.04643481, -0.01713725,  0.03508623,  1.61527416,
       -0.06366692, -0.04758619,  0.00649483, -0.05406478,  0.16566893,
        1.6230761 , -0.09710025, -0.02486652, -0.01971631, -0.01895734,
        0.01164191,  0.1041576 ,  2.14295338])
y true type: int32
y pred type: int32
y true shape: (118,)
y pred shape: (118,)
y_pred: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 0]
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       116
           1       0.18      1.00      0.31         2

    accuracy                           0.92       118
   macro avg       0.59      0.96      0.63       118
weighted avg       0.99      0.92      0.95       118

Scatter plot of the raw data

The decision boundary traced out by the plotted points

Assignment takeaways:

1. When calling classification_report, a mismatch between the data types of the predictions and the true labels can raise an error. Since this is a classification problem, the outputs should be discrete class labels rather than continuous values, so both are converted to integer NumPy arrays before scoring.

2. The reason θ0 is not constrained in either the gradient update or the cost function is that it can be understood as the intercept of the decision boundary. That intercept may be small, but its size is dictated by the samples: if the positive and negative samples themselves take large values, θ0 will be large too, so penalizing it would distort the fit without reducing overfitting.
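For reference, the standard regularized gradient-descent update (with learning rate α) makes the special treatment of θ0 explicit:

$$\theta_0 := \theta_0 - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_0^{(i)}$$

$$\theta_j := \theta_j \left( 1 - \alpha \frac{\lambda}{m} \right) - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}, \qquad j \ge 1$$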

3. The decision boundary is not drawn with a fitted line as in earlier exercises. Instead the approach resembles a Monte Carlo method: scatter a dense grid of points over the region (here 1000 × 1000), substitute each point into θᵀx = 0, and treat a point as lying on the curve when the equation approximately holds, i.e. |θᵀx| ≤ ε. Plotting those points traces out the rough shape of the boundary.
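As a quick experiment (a minimal sketch that reuses the draw_boundary function defined above; the exact curves depend on the optimizer run), varying λ shows what regularization buys us: λ = 0 removes the penalty and tends to overfit, while a very large λ over-smooths the boundary.

draw_boundary(power=6, l=0)    # no penalty: the boundary chases noisy points (overfitting)
draw_boundary(power=6, l=100)  # heavy penalty: the boundary becomes overly smooth (underfitting)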

Reference video for correcting the assignment: 【作业讲解】编程作业2:逻辑回归+正则化(下)_哔哩哔哩_bilibili
