Logistic Regression

最新推荐文章于 2020-05-22 14:30:07 发布

weixin_43579079

最新推荐文章于 2020-05-22 14:30:07 发布

阅读量216

点赞数

CC 4.0 BY-SA版权

分类专栏：机器学习

本文链接：https://blog.youkuaiyun.com/weixin_43579079/article/details/98894776

机器学习专栏收录该内容

11 篇文章

订阅专栏

1.关于逻辑

1.概率

定义：对一件事情发生可能性的衡量
范围：0-1
条件概率：在一件事已经发生的条件下，求另一件事发生的概率，即依赖于条件的概率。

2.线性回归

得到的是线性关系，用线性表达式表达

3.基本模型

训练样本为 X（ $x_1$ , $x_2$ , $x_3$ , …, $x_n$ ）

学习的参数为 W （ $w_1$ , $w_2$ , $w_3$ , …, $w_n$ ）

Z = $w_1$ $x_1$ + $w_2$ $x_2$ + $w_3$ $x_3$ + … + $w_n$ $x_n$

向量表示

Z = $W^T$ X

sigmoid函数将线性转换成非线性

g(Z) = $\frac{1}{1+e^{-Z}}$
其作用为：将正负无穷的范围映射到 (0,1) 之内

目标：通过训练样本求出参数W使损失函数最小化

解法：梯度下降（gradient descent）
alpha:学习率，代表下降的快慢
同时更新所有W
迭代更新直到收敛（可以设置收敛阈值）

2.代码及注释

main.py

# -*- coding: utf-8 -*-

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from lr_tools import LogisticRegression, cal_acc


def run_main():
    """
        Demo entry point.

        Generates a synthetic two-class data set, holds out 30% of it for
        testing, fits the logistic-regression model on the remainder and
        prints the true labels, the predicted labels and the accuracy.
    """
    # Synthetic data: 2000 samples, 100 features, 2 classes, fixed seed.
    features, labels = make_classification(
        n_samples=2000,
        n_features=100,
        n_classes=2,
        random_state=17,
    )

    # 70% training / 30% test split, seeded for reproducibility.
    split = train_test_split(features, labels, test_size=0.3, random_state=17)
    X_train, X_test, y_train, y_test = split

    # Train the model, then label the held-out samples.
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)

    print('真实值：', y_test)
    print('预测值：', y_pred)
    print('准确率：{:.2%}'.format(cal_acc(y_test, y_pred)))


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run_main()

lr_tools.py

# -*- coding: utf-8 -*-

import numpy as np
from scipy.optimize import fmin_l_bfgs_b


class LogisticRegression(object):
    """
        Binary logistic-regression classifier fitted with L-BFGS-B.

        The coefficient vector ``_beta`` holds the intercept first, followed
        by one weight per feature column. It is only available after
        ``fit`` has been called.
    """
    def __init__(self, c=1.):
        # Passed unchanged to cost_func as its ``C`` argument.
        self.c = c

    def fit(self, X, y):
        """
            Train the model.

            Parameters
            ----------
            X : ndarray of shape (n_samples, n_features)
            y : ndarray of n_samples labels, forwarded to cost_func

            Returns
            -------
            self, so calls can be chained.
        """
        # The optimiser works on a flat parameter vector, so start from a
        # 1-D zero vector: one intercept plus one weight per feature.
        beta0 = np.zeros(X.shape[1] + 1)

        # cost_func returns (objective, gradient); X, y and c are handed to
        # it as extra arguments on every evaluation.
        result = fmin_l_bfgs_b(cost_func, beta0, args=(X, y, self.c))

        # First element of the result tuple is the optimised parameters.
        self._beta = result[0]
        return self

    def predict(self, X):
        """
            Return the index (0 or 1) of the more probable class per row.
        """
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        """
            Return an (n_samples, 2) array of class probabilities.

            Column 0 is 1 - sigmoid(score) and column 1 is sigmoid(score),
            where score = [1, x] . _beta for each row x of X.
        """
        # Prepend a column of ones so the intercept is part of the product.
        X = np.hstack((np.ones((X.shape[0], 1)), X))
        scores = np.dot(X, self._beta).reshape((-1, 1))

        # Sigmoid evaluated through exp(-|score|), which never exceeds 1, so
        # extreme scores cannot overflow the exponential.
        decay = np.exp(-np.abs(scores))
        probs = np.where(scores >= 0, 1. / (1. + decay), decay / (1. + decay))
        return np.hstack((1 - probs, probs))


def cost_func(beta, X, y, C):
    """
        Objective and gradient for L2-regularised logistic regression.

        With z = [1, X] . beta the objective is

            C * sum_i (log(1 + exp(z_i)) - y_i * z_i) + 0.5 * beta . beta

        and the returned gradient is its exact derivative with respect to
        beta. The penalty covers every coefficient, intercept included.

        Parameters
        ----------
        beta : array-like with n_features + 1 entries, intercept first
        X : ndarray of shape (n_samples, n_features)
        y : array-like of n_samples labels (the data term is a negative
            log-likelihood when the labels are 0/1)
        C : weight applied to the data term

        Returns
        -------
        (float, ndarray of shape (n_features + 1,))
            Objective value and gradient.
    """
    # Work on a flat vector so the penalty gradient cannot broadcast into a
    # matrix if a column vector is supplied.
    beta = np.asarray(beta, dtype=float).ravel()

    # Prepend a column of ones so the intercept is part of the product.
    X = np.hstack((np.ones((X.shape[0], 1)), X))
    # Column vector, so it lines up row-by-row with the scores.
    y = np.asarray(y).reshape((-1, 1))

    z = np.dot(X, beta).reshape((-1, 1))

    # log(1 + exp(z)) through logaddexp stays finite for large |z|.
    # The 0.5 * beta.beta penalty matches the "+ beta" term of the gradient.
    neg_ll = C * np.sum(np.logaddexp(0., z) - y * z) + 0.5 * np.dot(beta, beta)

    # Sigmoid evaluated through exp(-|z|), which never exceeds 1.
    decay = np.exp(-np.abs(z))
    sigma = np.where(z >= 0, 1. / (1. + decay), decay / (1. + decay))

    # d/d(beta): C * X^T (sigma - y) for the data term, beta for the penalty.
    grad_neg_ll = C * np.sum((sigma - y) * X, axis=0) + beta

    return float(neg_ll), grad_neg_ll


def cal_acc(true_labels, pred_labels):
    """
        Compute classification accuracy.

        Parameters
        ----------
        true_labels : sequence of ground-truth labels
        pred_labels : sequence of predicted labels, same length

        Returns
        -------
        float
            Fraction of positions where the two sequences agree.

        Raises
        ------
        ValueError
            If the sequences differ in length or are empty.
    """
    n_total = len(true_labels)
    # A length mismatch would otherwise be scored on a truncated comparison.
    if n_total != len(pred_labels):
        raise ValueError(
            'true_labels and pred_labels differ in length: {} vs {}'.format(
                n_total, len(pred_labels)))
    # Accuracy is undefined without samples; fail with a clear message
    # instead of a bare division-by-zero.
    if n_total == 0:
        raise ValueError('cannot compute accuracy of empty label sequences')

    n_correct = sum(1 for t, p in zip(true_labels, pred_labels) if t == p)
    return n_correct / n_total