Python 实现LogisticRegression小Demo

最新推荐文章于 2023-09-19 11:58:56 发布

原创最新推荐文章于 2023-09-19 11:58:56 发布 · 615 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#机器学习 #python

机器学习专栏收录该内容

38 篇文章

订阅专栏

1. 加载数据（从本地文件导入制表符分隔的 TSV 文件）

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import sys

# Announce the loading step, read the tab-separated file, and round the
# table to 4 decimal places before extracting arrays.
print("载入数据")
raw_table = pd.read_csv('data.tsv', sep='\t')
dataset = raw_table.round(decimals=4)
# Feature matrix (the five named feature columns) and target vector.
X = dataset[['feature1', 'feature2', 'feature3', 'feature4', 'feature5']].values
y = dataset['label'].values

2. 训练集测试集划分与数据归一化

仅使用训练集拟合一个 scaler，然后用它对训练集和测试集做相同的转换（避免测试集信息泄漏）

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows for testing; stratify on y so both splits keep the
# same class ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

# Feature scaling: learn mean/scale from the training split only, then apply
# the same transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Fitted statistics, in order: per-feature scale, mean, variance, and the
# number of samples seen during fit.
for fitted_stat in (sc.scale_, sc.mean_, sc.var_, sc.n_samples_seen_):
    print(fitted_stat)

# To score a raw (unscaled) sample x by hand with a fitted classifier:
#   p = 1.0 / (1.0 + np.exp(-(np.dot((x - sc.mean_) / sc.scale_, coef) + intercept)))

3. 模型训练

from sklearn.linear_model import LogisticRegression

# `datetime` is used for timing below but is not imported elsewhere in this
# script, so import it here next to its only use.
import datetime

# Fit a logistic regression on the training set and report the wall-clock
# training time together with the learned parameters.
lr = LogisticRegression(random_state=0)
start = datetime.datetime.now()
lr.fit(X_train, y_train)
end = datetime.datetime.now()
execution_time = end - start
print("训练时间：" + str(execution_time))
print("Coefficients:%s, intercept %s" % (lr.coef_, lr.intercept_))

超参数调优&网格搜索

from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The liblinear solver is used because it
# supports both the 'l1' and 'l2' penalties tried below; the C and penalty
# given here are overridden by the grid.
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear')

penalty_list = ['l1', 'l2']
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penalty_list, C=C_list)
# 5-fold cross-validation over every (penalty, C) pair, scored by negative
# log loss.
grid = GridSearchCV(lr, tuned_parameters, cv=5, scoring='neg_log_loss')
grid.fit(X_train, y_train)
print(grid.best_params_)
# GridSearchCV fits clones of the estimator, so `lr` itself is still unfitted
# at this point. Rebind it to the refitted best model so that the later
# predict()/predict_proba() calls use the tuned classifier.
lr = grid.best_estimator_

4. 模型预测与结果处理

4.1 predict_proba() 模型预测概率

# Predicted probabilities on the test set.
# predict_proba() returns one column per class; index 1 is the SECOND column,
# i.e. lr.classes_[1] (the positive class when labels are 0/1).
y_pred_prob = lr.predict_proba(X_test)[:, 1]
threshold = 0.9

# Threshold into a separate array instead of overwriting y_pred_prob in place,
# so the probabilities stored in the result table cannot be clobbered.
# Float 1.0/0.0 values match what the original element-wise loop produced.
y_pred_label = np.where(y_pred_prob >= threshold, 1.0, 0.0)

# NOTE: the column names 'lable' / 'lable_pred' (sic) are kept unchanged so the
# output file schema stays the same for existing consumers.
result = pd.DataFrame()
result['lable'] = y_test             # true label
result['pred_prob'] = y_pred_prob    # predicted probability
result['lable_pred'] = y_pred_label  # predicted label at `threshold`
print(result.head())

# Sort by predicted probability, highest first. The table built above has no
# 'uid' column, so sorting on it would raise KeyError.
result = result.sort_values(by='pred_prob', ascending=False)
# Write the table to a tab-separated file without the index column.
result.to_csv("./output.csv", sep='\t', index=False, encoding='gbk')

4.2 predict() 模型预测标签（默认阈值0.5）

# `metrics` is needed here, before the evaluation section imports it.
from sklearn import metrics

# Predict hard class labels for the test set (predict() uses the default
# decision rule of the classifier).
y_pred = lr.predict(X_test)

# Log loss must be computed from predicted probabilities; feeding it hard 0/1
# labels only measures the error rate scaled by a clipping constant.
y_test = y_test.astype(np.float64)
print('log_loss', metrics.log_loss(y_test, lr.predict_proba(X_test)))

5. 模型评测

from sklearn import metrics

# Metrics computed from the hard labels in y_pred.
print('测试集混淆矩阵: ', metrics.confusion_matrix(y_test, y_pred))
print('准确率Accuracy(Test): %.4g' % metrics.accuracy_score(y_test, y_pred))
# AUC is a ranking metric and needs continuous scores: use the probability of
# lr.classes_[1] rather than thresholded labels, which collapse the ROC curve
# to a single operating point.
y_score = lr.predict_proba(X_test)[:, 1]
print('AUC Score: %f' % metrics.roc_auc_score(y_true=y_test, y_score=y_score))
# NOTE(review): precision uses average='weighted' while recall below uses the
# default averaging, so the two numbers are not directly comparable - confirm
# which averaging is intended.
print('精确率Precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
print('召回率Recall:', metrics.recall_score(y_test, y_pred))

print('分类结论: \n', metrics.classification_report(y_test, y_pred))

6. smote 采样&随机下采样

6.1 SMOTE采样

from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE (seeded for reproducibility).
# The variable name `smote_enn` is kept as-is; the object is a plain SMOTE sampler.
smote_enn = SMOTE(random_state=0)
# fit_resample() is the current sampler API; the older fit_sample() alias was
# deprecated and later removed from imbalanced-learn.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
print('resampled样本量：{}'.format(X_resampled.shape[0]))

6.2 RandomUnderSampler采样

from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training split (seeded for reproducibility).
rus = RandomUnderSampler(random_state=0)
# fit_resample() is the current sampler API; the older fit_sample() alias was
# deprecated and later removed from imbalanced-learn.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
# Report the row counts before and after resampling.
print('样本量：{}'.format(X_train.shape[0]))
print('resampled样本量：{}'.format(X_resampled.shape[0]))

6.3 样本分布

from collections import Counter

# Print the per-class sample counts before and after resampling, each as a
# list of (label, count) pairs sorted by label.
for title, labels in (
    ('正负样本分布:', y_train),
    ('resampled正负样本分布:', y_resampled),
):
    print('*' * 10, title, sorted(Counter(labels).items()))

# Refit the classifier on the resampled training data.
lr.fit(X_resampled, y_resampled)

7. 特征处理

7.1 one_hot编码

# One-hot encode the 'provinceid' column and append the resulting indicator
# columns to the right of the original frame.
df_dummies = pd.get_dummies(df['provinceid'])
data = pd.concat(objs=[df, df_dummies], axis='columns')

Python 实现LogisticRegression小Demo

2 条评论