1. 加载数据(从本地文件导入以制表符分隔的 TSV 数据)
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import sys
# Load the tab-separated dataset from disk, round it to four decimals, and
# split it into the feature matrix X and the target vector y.
print("载入数据")
feature_columns = ['feature1', 'feature2', 'feature3', 'feature4', 'feature5']
dataset = pd.read_csv('data.tsv', sep='\t')
dataset = dataset.round(decimals=4)
X = dataset.loc[:, feature_columns].values  # one row per sample
y = dataset.loc[:, 'label'].values  # target column
2. 训练集测试集划分与数据归一化
使用训练集训练一个scaler,然后对测试集进行转化
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Hold out 30% of the samples for testing. The split is stratified on the
# label and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Feature scaling: fit the scaler on the training split only, then apply the
# same fitted transformation to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Print the fitted statistics in order: scale, mean, variance, and the number
# of samples the scaler was fitted on.
for fitted_stat in (sc.scale_, sc.mean_, sc.var_, sc.n_samples_seen_):
    print(fitted_stat)

# Manual scoring reference (not executed): for a raw, unscaled sample row `t`
# and fitted logistic-regression parameters `coef` / `intercept`, the predicted
# probability is
#   1.0 / (1.0 + np.exp(-(np.dot((t - sc.mean_) / sc.scale_, coef) + intercept)))
3. 模型训练
import datetime

from sklearn.linear_model import LogisticRegression

# Fit a baseline logistic regression on the (scaled) training set and report
# how long the fit took together with the learned parameters.
# `datetime` is imported here because nothing earlier in the file imports it;
# without it the timing lines raise NameError.
lr = LogisticRegression(random_state=0)
start = datetime.datetime.now()
lr.fit(X_train, y_train)
end = datetime.datetime.now()
execution_time = end - start
print("训练时间:" + str(execution_time))
print("Coefficients:%s, intercept %s" % (lr.coef_, lr.intercept_))
超参数调优&网格搜索
from sklearn.model_selection import GridSearchCV

# Hyper-parameter tuning: 5-fold cross-validated grid search over the penalty
# type and the inverse regularisation strength C, scored by negative log loss.
# The liblinear solver is used because it supports both 'l1' and 'l2'.
lr = LogisticRegression(C=10, penalty='l1', solver='liblinear')
penalty_list = ['l1', 'l2']
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
# Other solvers that could be searched: 'liblinear', 'newton-cg', 'lbfgs'
tuned_parameters = dict(penalty=penalty_list, C=C_list)
grid = GridSearchCV(lr, tuned_parameters, cv=5, scoring='neg_log_loss')
grid.fit(X_train, y_train)
print(grid.best_params_)

# GridSearchCV fits clones of the estimator it is given, so the `lr` object
# created above is still unfitted at this point. Rebind `lr` to the refitted
# best estimator so the prediction code that follows uses a trained model.
lr = grid.best_estimator_
4. 模型预测与结果处理
4.1 predict_proba() 模型预测概率
# Predicted probabilities.
# predict_proba() returns one column per class; index 1 selects the SECOND
# column, i.e. the class listed second in lr.classes_ (the positive class for
# 0/1 labels -- confirm against the dataset).
y_pred_prob = lr.predict_proba(X_test)[:, 1]

result = pd.DataFrame()
# NOTE: the 'lable' / 'lable_pred' spellings are kept as-is because they are
# the column headers written to the output file.
result['lable'] = y_test  # ground-truth label
result['pred_prob'] = y_pred_prob  # predicted probability

# Convert probabilities to hard labels with a custom threshold. The labels go
# into a new array so `y_pred_prob` keeps holding probabilities instead of
# being overwritten in place.
threshold = 0.9
result['lable_pred'] = np.where(y_pred_prob >= threshold, 1.0, 0.0)  # predicted label
print(result.head())

# Sort the predictions by descending probability. `result` has no 'uid'
# column (only the three columns created above), so sorting on 'uid' would
# raise KeyError.
result = result.sort_values(by='pred_prob', ascending=False)

# Write the result to a tab-separated file without the index column.
result.to_csv("./output.csv", sep='\t', index=False, encoding='gbk')
4.2 predict() 模型预测标签(默认阈值0.5)
from sklearn import metrics

# Predict hard labels for the test set; `y_pred` is what the evaluation
# metrics further down consume.
y_pred = lr.predict(X_test)

# Log loss on the test set. It is computed from the predicted class
# probabilities: passing the hard 0/1 labels from predict() would only measure
# clipped, saturated "probabilities" and give a meaningless value.
# `metrics` is imported here because this line runs before the later import.
y_test = y_test.astype(np.float64)
print('log_loss', metrics.log_loss(y_test, lr.predict_proba(X_test)))
5. 模型评测
from sklearn import metrics

# Evaluate the hard predictions `y_pred` against the true test labels.
print('测试集混淆矩阵: ', metrics.confusion_matrix(y_test, y_pred))
print('准确率Accuracy(Test): %.4g' % metrics.accuracy_score(y_test, y_pred))

# AUC is a ranking metric, so it is computed from the predicted probability of
# the second class in lr.classes_ rather than from the thresholded labels
# (hard labels collapse the ROC curve to a single operating point).
y_score = lr.predict_proba(X_test)[:, 1]
print('AUC Score: %f' % metrics.roc_auc_score(y_true=y_test, y_score=y_score))

# NOTE(review): precision is support-weighted across classes while recall uses
# the default (positive class only), so the two numbers are not directly
# comparable -- confirm which averaging is intended.
print('精确率Precision:', metrics.precision_score(y_test, y_pred, average='weighted'))
print('召回率Recall:', metrics.recall_score(y_test, y_pred))
print('分类结论: \n', metrics.classification_report(y_test, y_pred))
6. smote 采样&随机下采样
6.1 SMOTE采样
from imblearn.over_sampling import SMOTE

# Over-sample the training set with SMOTE.
# NOTE: despite its name, `smote_enn` is a plain SMOTE sampler (not SMOTEENN);
# the variable name is kept unchanged for compatibility.
smote_enn = SMOTE(random_state=0)
# `fit_sample` was deprecated and later removed from imbalanced-learn;
# `fit_resample` is the supported method and returns the same (X, y) pair.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
print('resampled样本量:{}'.format(X_resampled.shape[0]))
6.2 RandomUnderSampler采样
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the training set and report the sample counts before
# and after resampling.
rus = RandomUnderSampler(random_state=0)
# `fit_sample` was deprecated and later removed from imbalanced-learn;
# `fit_resample` is the supported method and returns the same (X, y) pair.
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
print('样本量:{}'.format(X_train.shape[0]))
print('resampled样本量:{}'.format(X_resampled.shape[0]))
6.3 样本分布
from collections import Counter

# Show the class distribution before and after resampling as sorted
# (label, count) pairs.
banner = '*' * 10
original_counts = sorted(Counter(y_train).items())
resampled_counts = sorted(Counter(y_resampled).items())
print(banner, '正负样本分布:', original_counts)
print(banner, 'resampled正负样本分布:', resampled_counts)

# Refit the model on the resampled training data.
lr.fit(X_resampled, y_resampled)
7. 特征处理
7.1 one_hot编码
# One-hot encode the 'provinceid' column and append the indicator columns to
# the original frame (the source column itself is kept).
# NOTE(review): `df` is not defined in the visible part of this file --
# presumably the DataFrame loaded earlier; confirm before running.
province_column = df['provinceid']
df_dummies = pd.get_dummies(province_column)
data = pd.concat(objs=[df, df_dummies], axis='columns')
8582





