本文基于 Kaggle 电信用户流失(Telco Customer Churn)案例数据,数据集可在 Kaggle 官网下载。
导入库
# 基础数据科学运算库
import numpy as np
import pandas as pd
# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt
# 时间模块
import time
import warnings
warnings.filterwarnings('ignore')
# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
# 实用函数
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# 网格搜索
from sklearn.model_selection import GridSearchCV
# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin
# re模块相关
import inspect, re
# 其他模块
from tqdm import tqdm
import gc
数据预处理模块
然后进行数据清洗相关工作:
# Load the raw Telco customer-churn table from the working directory.
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Declare the role of every column: discrete features, continuous features,
# the label, and the row identifier.
# Discrete (categorical) feature columns.
category_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
]
# Continuous (numeric) feature columns.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Label column.
target = 'Churn'
# Identifier column; it is dropped before modelling.
ID_col = 'customerID'

# Sanity check: the declared feature columns plus the ID and label columns
# (the "+ 2") must account for every column of the loaded table.
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1], (
    'category_cols + numeric_cols + ID/label columns do not account for '
    'every column of the loaded table'
)

# Continuous-field conversion: 'TotalCharges' contains single-space strings
# where a number is missing. Blank them out to NaN, cast to float, then
# fill the resulting gaps with 0.
total_charges = tcc['TotalCharges']
tcc['TotalCharges'] = (
    total_charges.where(total_charges != ' ').astype(float).fillna(0)
)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# Encode the label as 1 ('Yes') / 0 ('No').
# The result is assigned back to the frame instead of calling
# ``tcc['Churn'].replace(..., inplace=True)``: that chained in-place form
# mutates a temporary under pandas Copy-on-Write and would leave ``tcc``
# unchanged. Any value other than 'Yes'/'No' maps to NaN and makes the
# int64 cast raise, so an unexpected label cannot slip through silently.
tcc[target] = tcc[target].map({'Yes': 1, 'No': 0}).astype('int64')

# Split into the feature matrix (no ID, no label) and the label vector.
features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc[target].copy()
时序特征衍生
# 定义辅助函数
def colName(ColumnTransformer, numeric_cols, category_cols):
col_name = []
col_value = ColumnTransformer.named_transformers_['cat'].categories_
for i, j in enumerate(category_cols):
if len(col_value[i]) == 2:
col_name.append(j)
else:
for f in col_value[i]:
feature_name = j