Python 实现LogisticRegression小Demo

1.  加载数据(以制表符分隔的 TSV 格式,从本地文件 data.tsv 导入)

# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import sys

# Load the tab-separated dataset from the local file and round every numeric
# value to 4 decimal places before extracting features and labels.
print("载入数据")
raw_table = pd.read_csv('data.tsv', sep='\t')
dataset = raw_table.round(decimals=4)

# Feature matrix (five named feature columns) and label vector as NumPy arrays.
feature_columns = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
X = dataset[feature_columns].values
y = dataset['label'].values

2. 训练集测试集划分与数据归一化

仅使用训练集拟合(fit)一个 scaler,然后用它分别对训练集和测试集进行转换,避免测试集的信息泄漏到训练过程中。

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the samples as the test set. The split is stratified on y
# and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Feature scaling: fit the scaler on the training split only, then apply the
# same learned statistics to both the training and the test split.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Print the fitted statistics in order: scale, mean, variance, and the number
# of samples seen while fitting.
for fitted_stat in (sc.scale_, sc.mean_, sc.var_, sc.n_samples_seen_):
    print(fitted_stat)

# To score a raw (unscaled) sample x by hand once a classifier is fitted:
#   z = np.dot((x - sc.mean_) / sc.scale_, coefficients) + intercept
#   y_pre = 1.0 / (1.0 + np.exp(-z))

3.  模型训练

from sklearn.linear_model import LogisticRegression

# `datetime` is needed for the timing below but is not imported anywhere
# else in this script, so import it here.
import datetime

# Fit a logistic-regression classifier on the (scaled) training set and
# report the wall-clock time the fit took.
lr = LogisticRegression(random_state=0)
start = datetime.datetime.now()
lr.fit(X_train, y_train)
end = datetime.datetime.now()
execution_time = end - start
print("训练时间:"+str(execution_time))
# Learned weights and bias of the fitted model.
print("Coefficients:%s, intercept %s" % (lr.coef_, lr.intercept_))

超参数调优&网格搜索

from sklearn.model_selection import GridSearchCV

# Base estimator for the search. The liblinear solver is used because it
# supports both the 'l1' and 'l2' penalties tried below.
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear')

# Search grid: penalty type x inverse regularization strength C.
penalty_list = ['l1', 'l2']
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
tuned_parameters = dict(penalty=penalty_list, C=C_list)

# 5-fold cross-validated grid search, scored by negative log loss.
grid = GridSearchCV(lr, tuned_parameters, cv=5, scoring='neg_log_loss')
grid.fit(X_train, y_train)
print(grid.best_params_)

# GridSearchCV fits clones of the estimator, so the `lr` created above is
# still unfitted at this point. Rebind `lr` to the refitted best model so
# that the prediction code that follows works on a trained classifier.
lr = grid.best_estimator_

4. 模型预测与结果处理

4.1   predict_proba() 模型预测概率

# Predicted probabilities on the test set.
# predict_proba() returns one column per class; index 1 selects the SECOND
# column, i.e. the probability of lr.classes_[1].
# NOTE(review): presumably that is the positive class (label 1) - confirm
# against the label encoding of the dataset.
y_pred_prob = lr.predict_proba(X_test)[:, 1]

result = pd.DataFrame()
result['lable'] = y_test           # true label
result['pred_prob'] = y_pred_prob  # predicted probability
threshold = 0.9

# Vectorized thresholding: 1.0 when the probability reaches the threshold,
# otherwise 0.0. The labels go into a separate array so that `y_pred_prob`
# keeps holding probabilities instead of being overwritten in place.
y_pred_label = (y_pred_prob >= threshold).astype(np.float64)

result['lable_pred'] = y_pred_label  # predicted label
print(result.head())

# Sort by predicted probability, highest first. The earlier version also
# sorted by a 'uid' column, but `result` has no such column, which raised a
# KeyError.
result = result.sort_values(by='pred_prob', ascending=False)

# Write the result to a tab-separated file without the index column.
result.to_csv("./output.csv", sep='\t', index=False, encoding='gbk')

4.2   predict() 模型预测标签(默认阈值0.5)

# `metrics` is used below, but the script only imports it in a later
# section; import it here so this section runs on its own.
from sklearn import metrics

# Predict hard class labels for the test set.
y_pred = lr.predict(X_test)

# Log loss on the test set. Log loss is defined on predicted probabilities,
# so it is computed from predict_proba() rather than from the hard labels
# in `y_pred` (hard 0/1 labels would be treated as fully confident
# probabilities and distort the loss).
y_test = y_test.astype(np.float64)
print('log_loss', metrics.log_loss(y_test, lr.predict_proba(X_test)))

5.  模型评测

from sklearn import metrics

# Scores for ROC AUC: AUC measures how well the model ranks samples, so it
# needs continuous scores (the probability of lr.classes_[1]) rather than
# the thresholded labels in `y_pred`.
y_score = lr.predict_proba(X_test)[:, 1]

print('测试集混淆矩阵: ', metrics.confusion_matrix(y_test, y_pred))
print('准确率Accuracy(Test): %.4g' % metrics.accuracy_score(y_test, y_pred))
print('AUC Score: %f' % metrics.roc_auc_score(y_true=y_test, y_score=y_score))
# NOTE(review): precision is averaged with average='weighted' while recall
# uses the default averaging, so the two numbers are not directly comparable.
print('精确率Precision:', metrics.precision_score(y_test,y_pred,average='weighted'))
print('召回率Recall:', metrics.recall_score(y_test, y_pred))

# Per-class precision / recall / F1 summary.
print('分类结论: \n', metrics.classification_report(y_test, y_pred))

6. SMOTE 过采样 & 随机下采样

   6.1  SMOTE采样

from imblearn.over_sampling import SMOTE

# Oversample the training set with SMOTE (seeded for reproducibility).
# NOTE: the variable is named `smote_enn`, but this is a plain SMOTE sampler.
smote_enn = SMOTE(random_state=0)
# fit_resample() replaces the older fit_sample(), which was deprecated and
# later removed from imbalanced-learn.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
print('resampled样本量:{}'.format(X_resampled.shape[0]))

   6.2  RandomUnderSampler采样

from imblearn.under_sampling import RandomUnderSampler

# Randomly undersample the training set (seeded for reproducibility).
rus = RandomUnderSampler(random_state=0)
# fit_resample() replaces the older fit_sample(), which was deprecated and
# later removed from imbalanced-learn.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
# Sample counts before and after resampling.
print('样本量:{}'.format(X_train.shape[0]))
print('resampled样本量:{}'.format(X_resampled.shape[0]))

   6.3  样本分布

from collections import Counter

# Count the samples per class before and after resampling, ordered by label.
train_distribution = sorted(Counter(y_train).items())
resampled_distribution = sorted(Counter(y_resampled).items())

banner = '*' * 10
print(banner, '正负样本分布:', train_distribution)
print(banner, 'resampled正负样本分布:', resampled_distribution)

# Refit the classifier on the resampled training data.
lr.fit(X_resampled, y_resampled)

7. 特征处理

7.1 one_hot编码

# One-hot encode the `provinceid` column and append the resulting indicator
# columns to the original frame, column-wise.
# NOTE(review): `df` is not defined in the visible part of this script -
# presumably a DataFrame containing a `provinceid` column; confirm.
province_column = df['provinceid']
df_dummies = pd.get_dummies(province_column)
data = pd.concat(objs=[df, df_dummies], axis='columns')

评论 2
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值