逻辑回归

最新推荐文章于 2023-06-25 15:37:57 发布

OceanProo

最新推荐文章于 2023-06-25 15:37:57 发布

阅读量481

点赞数

CC 4.0 BY-SA版权

分类专栏： python 机器学习

本文链接：https://blog.youkuaiyun.com/YeChao3/article/details/84134050

python 同时被 2 个专栏收录

65 篇文章

订阅专栏

机器学习

20 篇文章

订阅专栏

本文介绍了逻辑回归的原理，包括其作为线性分类模型的特性。内容涵盖逻辑回归的计算过程，以及如何使用scikit-learn库进行实现。文章通过实例展示了如何训练和预测模型，并探讨了正则化、交叉验证、损失函数、优化算法和模型评估指标等关键概念。

简介

计算过程

scikit-learn实现

简介

Logistic回归，尽管名字里带有“回归”，实际上是用于分类的线性模型，而不是回归模型。Logistic回归在文献中也称为logit回归、最大熵分类（MaxEnt）或对数线性分类器。

计算过程

。。。

scikit-learn实现

linear_model.LogisticRegression，Logistic回归分类器。该实现可以拟合二分类、一对多（One-vs-Rest）或多项（multinomial）逻辑回归，并支持可选的L1或L2正则化。

linear_model.LogisticRegressionCV，Logistic回归CV分类器。使用内置交叉验证实现Logistic回归，以找出最佳C参数

这是一个简单的逻辑回归案例，帮助理解训练、预测模型：

import numpy as np
from sklearn.datasets import make_moons  #制作两个交叉的半圈
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# 定义决策边界制图函数
def plot_decision_boundary(pred_func):
    """Shade the feature plane by predicted class and overlay the samples.

    The function reads the module-level arrays ``X`` (two feature columns)
    and ``y`` (labels used as point colours) and draws on the current
    matplotlib figure; it does not call ``plt.show()``.

    Args:
        pred_func: Callable that receives an array with one row per grid
            point (two columns) and returns one prediction per row.
    """
    padding = .5   # margin added around the data range on every side
    step = 0.01    # spacing between neighbouring grid points

    first, second = X[:, 0], X[:, 1]

    # Coordinate matrices covering the padded bounding box of the data.
    xx, yy = np.meshgrid(
        np.arange(first.min() - padding, first.max() + padding, step),
        np.arange(second.min() - padding, second.max() + padding, step),
    )

    # Predict every grid point, then fold the flat result back into the
    # grid shape so it can be drawn as filled contours.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)            # filled regions
    plt.scatter(first, second, c=y, cmap=plt.cm.Spectral)    # data points
    
# Seed NumPy's global RNG, generate the two interleaving half-moons and plot them.
np.random.seed(0)
X, y = make_moons(500, noise=0.2)
plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=plt.cm.Spectral)
plt.title('The Data Distribution')
plt.show()

# Fit a logistic-regression classifier with scikit-learn's default settings.
clf = LogisticRegression()
clf.fit(X, y)

# Draw the regions the fitted model assigns to each class; the bound
# ``predict`` method is handed over directly as the prediction function.
plot_decision_boundary(clf.predict)
plt.title('Logistic Regression')
plt.show()

这是一个预测模型案例，帮助理解逻辑回归：

import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model  #广义线性模型，利用最小角度回归和坐标下降计算的岭回归，贝叶斯回归
from sklearn import decomposition  #矩阵分解，矩阵分解算法，包括PCA，NMF或ICA。该模块的大多数算法可以被视为降维技术
from sklearn import datasets  #数据集，包括加载和获取常用参考数据集的方法。它还具有一些人工数据生成器
from sklearn.pipeline import Pipeline  #管道，实现了用于构建复合估计器的实用程序，作为变换和估计器链
from sklearn.model_selection import GridSearchCV  #超参数优化器
# NOTE: GridSearchCV is imported from sklearn.model_selection above; the older
# sklearn.grid_search module is presumably its predecessor — confirm against
# the scikit-learn changelog for the installed version.
logistic = linear_model.LogisticRegression()  # logistic-regression classifier (final pipeline step)
pca = decomposition.PCA()  # principal component analysis (dimensionality-reduction step)
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])  # PCA transform followed by the classifier

digits = datasets.load_digits()  # load the digits dataset
X_digits = digits.data
y_digits = digits.target

pca.fit(X_digits)  # fit PCA alone so its explained-variance spectrum can be plotted

# Plot the explained variance of each principal component
plt.figure(1, figsize=(4, 3))
plt.clf()  # clear the current figure
plt.axes([.2, .2, .7, .7])  # add an axes at [left, bottom, width, height] in figure fractions
plt.plot(pca.explained_variance_, linewidth=2)  # one value per component, drawn as a line
plt.axis('tight')  # fit the axis limits tightly around the data
plt.xlabel('n_components')  # x-axis label
plt.ylabel('explained_variance_')  # y-axis label

# Hyper-parameter search (candidate values for the grid)
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)  # 3 values evenly spaced on a log scale between 1e-4 and 1e4

# Parameters of pipelines can be set using '__' separated parameter names:
# '<step name>__<parameter name>', e.g. 'pca__n_components' below.

estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))  # exhaustive search over the grid with cross-validation

estimator.fit(X_digits, y_digits)  # run the search and refit the best pipeline

plt.axvline(estimator.best_estimator_.named_steps['pca'].n_components,
            linestyle=':', label='n_components chosen')  # vertical line at the selected n_components
plt.legend(prop=dict(size=12))  # place a legend on the axes
plt.show()

otto项目

default Logistic Regression(正则化的 Logistic Regression及参数调优)

用LogisticRegressionCV实现正则化的Logistic Regression（L1正则、L2正则）。注：LogisticRegressionCV比GridSearchCV快。

import pandas as pd
import numpy as np
from sklearn.metrics import log_loss  # the competition metric is logloss
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import zero_one_loss
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.sans-serif'] = ['SimHei']  # NOTE(review): presumably chosen so the Chinese axis label renders
train = pd.read_csv('./data/train/train.csv', index_col=['id'])

# Target distribution: check whether the classes are balanced.
train['target'].value_counts().plot(kind='bar')
plt.ylabel('各类别占比')

# Labels look like 'Class_<k>'; drop the 6-character prefix and shift to 0-based ints.
y = train['target'].map(lambda s: int(s[6:]) - 1)
X = train.drop(['target'], axis=1)

# Hold out part of the data for evaluation. The original notes used
# X_train / X_test / y_train / y_test without ever creating them.
# stratify=y keeps the class proportions equal in both parts; the 20% size
# and the fixed seed are choices made here, adjust as needed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0)

# Train the baseline model and report per-class precision / recall / F1.
lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train, y_train)
y_predict = lr.predict(X_test)
report = classification_report(y_test, y_predict)
print(report)

# Cross-validation is used to estimate model performance and to tune
# hyper-parameters (model selection). For classifiers an integer ``cv``
# uses StratifiedKFold by default.
# cross_val_score lives in sklearn.model_selection; the sklearn.cross_validation
# module used by the original notes is not available in current releases.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# scoring='accuracy', so these numbers are accuracies, not log-losses.
print('accuracy of each fold is: ', cv_scores)
print('cv accuracy is:', cv_scores.mean())

# Candidate hyper-parameters: penalty type and inverse regularisation strength C.
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100, 200]}

lr_penalty = LogisticRegression(class_weight='balanced', n_jobs=-1, solver='saga')
# return_train_score=True is required for the 'mean_train_score' /
# 'std_train_score' entries read below to exist in cv_results_.
grid = GridSearchCV(lr_penalty, param_grid, cv=5, scoring='accuracy',
                    return_train_score=True)
grid.fit(X_train, y_train)
print(grid.cv_results_)   # full table of search results
print(grid.best_score_)   # best mean cross-validated score
print(grid.best_params_)  # hyper-parameters that achieved it

# ---- plot the cross-validation curves ----
# Take the grid axes from param_grid so the plot always matches the search.
Cs = param_grid['C']
penaltys = param_grid['penalty']
n_Cs = len(Cs)
number_penaltys = len(penaltys)

# NOTE(review): the reshape assumes cv_results_ is ordered with 'C' as the
# slow axis and 'penalty' as the fast axis (grid keys iterated in sorted
# order) — confirm against the installed scikit-learn version.
results = grid.cv_results_
test_scores = np.array(results['mean_test_score']).reshape(n_Cs, number_penaltys)
train_scores = np.array(results['mean_train_score']).reshape(n_Cs, number_penaltys)
test_stds = np.array(results['std_test_score']).reshape(n_Cs, number_penaltys)
train_stds = np.array(results['std_train_score']).reshape(n_Cs, number_penaltys)

x_axis = np.log10(Cs)
for i, value in enumerate(penaltys):
    plt.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i], label=value + ' Test')
    plt.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i], label=value + ' Train')

plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')  # the search is scored with accuracy, not a loss
plt.savefig('LogisticGridSearchCV_C.png')

plt.show()

# Load the data and look at its basic properties.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss  # evaluation metric logloss -- imported but not used below
from sklearn.model_selection import cross_val_score  # scores an estimator by cross-validation
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams['font.sans-serif'] = ['SimHei']  # NOTE(review): presumably chosen so the Chinese axis label renders

train = pd.read_csv('otto/Otto_train.csv', index_col=['id'])
# Outside a notebook a bare expression displays nothing, so print explicitly.
print(train.head())
print(train.shape)
train.info()
print(train.describe())  # the original omitted the call parentheses

# Target distribution: check whether the classes are balanced (pandas plotting).
train['target'].value_counts().plot(kind='bar')
plt.ylabel('各类别占比')

# Label encoding. Targets look like 'Class_<k>': s[6:] removes the
# 6-character 'Class_' prefix and the result is shifted to a 0-based int.
y_train = train['target'].map(lambda s: int(s[6:]) - 1)

# Everything except the target column is a feature.
train = train.drop('target', axis=1)
X_train = np.array(train)

# With limited compute one can work on a subsample instead (making sure the
# class proportions of the subsample match the full data), or validate with
# a single hold-out split rather than cross-validation, e.g.:
#   from sklearn.model_selection import train_test_split
#   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Standardise the features.
ss_X = StandardScaler()
X_train = ss_X.fit_transform(X_train)
# A held-out test set, if one is created, must be transformed with the same
# fitted scaler: X_test = ss_X.transform(X_test)

# Baseline model. The original notes used ``lr`` here without defining it
# in this script.
lr = LogisticRegression(n_jobs=-1)

# Cross-validation is used to estimate performance and to tune
# hyper-parameters; for classifiers an integer ``cv`` uses StratifiedKFold
# by default, which samples every fold in proportion to the class sizes.
cv_scores = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
# scoring='accuracy', so these numbers are accuracies, not log-losses.
print('accuracy of each fold is: ', cv_scores)
print('cv accuracy is:', cv_scores.mean())

# Regularised LogisticRegression. The hyper-parameters to tune are C
# (candidates are usually spaced evenly on a log scale) and the penalty
# type (L1 / L2). In scikit-learn C is the INVERSE regularisation strength:
# the objective is  J = penalty(w) + C * sum(logloss(f(xi), yi)),
# so a larger C means weaker regularisation.
# Tuning follows the same steps for every scikit-learn estimator:
# define the candidate grid, build a GridSearchCV, call fit.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Parameters to tune.
# Exercise: search L1 and L2 separately, each with a solver suited to it.
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100, 200]}

lr_penalty = LogisticRegression(class_weight='balanced', n_jobs=-1, solver='saga')
# return_train_score=True is required for the 'mean_train_score' /
# 'std_train_score' entries read below to exist in cv_results_.
grid = GridSearchCV(lr_penalty, param_grid, cv=5, scoring='accuracy',
                    return_train_score=True)
grid.fit(X_train, y_train)  # the author reports this takes an hour or more

# Inspect the results. If the best value sits on the edge of the candidate
# range, extend the range in that direction until the turning point is
# found. (The author's run selected l2 with C=100.)
print(grid.cv_results_)
print(grid.best_score_)
print(grid.best_params_)

# ---- plot the cross-validation curves ----
# Take the grid axes from param_grid so the plot always matches the search.
Cs = param_grid['C']
penaltys = param_grid['penalty']
n_Cs = len(Cs)
number_penaltys = len(penaltys)

# NOTE(review): the reshape assumes cv_results_ is ordered with 'C' as the
# slow axis and 'penalty' as the fast axis (grid keys iterated in sorted
# order) — confirm against the installed scikit-learn version.
results = grid.cv_results_
test_scores = np.array(results['mean_test_score']).reshape(n_Cs, number_penaltys)
train_scores = np.array(results['mean_train_score']).reshape(n_Cs, number_penaltys)
test_stds = np.array(results['std_test_score']).reshape(n_Cs, number_penaltys)
train_stds = np.array(results['std_train_score']).reshape(n_Cs, number_penaltys)

x_axis = np.log10(Cs)
for i, value in enumerate(penaltys):
    plt.errorbar(x_axis, test_scores[:, i], yerr=test_stds[:, i], label=value + ' Test')
    plt.errorbar(x_axis, train_scores[:, i], yerr=train_stds[:, i], label=value + ' Train')

plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')  # the search is scored with accuracy, not a loss
plt.savefig('LogisticGridSearchCV_C.png')

plt.show()
# The figure shows training and validation accuracy for each penalty and C.
# Author's observation: on the training folds a larger C (less
# regularisation) always scores higher, while on the validation folds
# C=100 was best for both L1 and L2.

# ---- L1 regularisation ----
# LogisticRegressionCV is used here before the original notes import it,
# so import it at the point of first use.
from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]
# Many samples (60k+) and 93 features. The author notes that 'saga' is an
# alternative solver for L1; this run uses liblinear.
# LogisticRegressionCV is faster than an equivalent GridSearchCV.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# ``multi_class`` argument — confirm against the installed version.
lrcv_L1 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='accuracy', penalty='l1',
                               solver='liblinear', multi_class='ovr')
lrcv_L1.fit(X_train, y_train)

# scores_ is a dict keyed by class label; each value is the grid of scores
# collected during cross-validation with shape (n_folds, len(Cs)).
# Average over folds per class. Iterating classes_ avoids hard-coding the
# number of classes (9 for this data set).
n_Cs = len(Cs)
scores = np.array([np.mean(lrcv_L1.scores_[label], axis=0)
                   for label in lrcv_L1.classes_])

# Mean accuracy over all one-vs-rest classifiers for each C.
mean_accuracy = np.mean(scores, axis=0)
plt.plot(np.log10(Cs), mean_accuracy.reshape(n_Cs, 1))
plt.xlabel('log(C)')
plt.ylabel('accuracy')
plt.show()

# For a multi-class problem every per-class classifier has its own C.
# (The original printed ``lr_cv.C_``, a name that is never defined.)
print('C is:', lrcv_L1.C_)

# Author's observation: the penalty is too weak here, the coefficients are
# not sparse.
print(lrcv_L1.coef_)

# ---- L2 regularisation ----
from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]
# Many samples (60k+) and 93 features. The default solver for L2 is lbfgs;
# liblinear is used so the result can be compared with the GridSearchCV run.
# NOTE(review): recent scikit-learn releases reportedly deprecate the
# ``multi_class`` argument — confirm against the installed version.
lr_cv_L2 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='accuracy', penalty='l2',
                                solver='liblinear', multi_class='ovr')
lr_cv_L2.fit(X_train, y_train)

# scores_ is a dict keyed by class label; each value is the grid of scores
# collected during cross-validation with shape (n_folds, len(Cs)).
# (The original read ``lr_cv.scores_``, a name that is never defined.)
n_Cs = len(Cs)
scores = np.array([np.mean(lr_cv_L2.scores_[label], axis=0)
                   for label in lr_cv_L2.classes_])

# Mean accuracy over all one-vs-rest classifiers for each C.
mean_accuracy = np.mean(scores, axis=0)
plt.plot(np.log10(Cs), mean_accuracy.reshape(n_Cs, 1))
plt.xlabel('log(C)')
plt.ylabel('accuracy')
plt.show()

# For a multi-class problem every per-class classifier has its own C:
# print('C is:', lr_cv_L2.C_)

贝叶斯算法可能需要背===============

二分类，多分类

损失函数：使用对数损失

优化算法：梯度下降==随机梯度下降==mini-batch随机梯度下降

交叉熵:用来表示两个概率分布距离(差异)

熵：信息的加权平均，log(a<1)<0

信息：必然发生的事件信息量很少；发生的概率越大，信息量越小；概率越小，信息量越大(比较震惊)

通过softmax构建损失函数

类平衡

工业界一般新增数据量锻炼模型

样本采集：采集，采样，造数据

采集和采样