机器学习网格搜索超参数优化实战(随机森林) ##4

最新推荐文章于 2025-08-20 20:33:51 发布

原创

最新推荐文章于 2025-08-20 20:33:51 发布 · 1.9k 阅读

30 ·

CC 4.0 BY-SA版权

文章标签：

#机器学习 #随机森林 #人工智能

文章目录

基于Kaggle电信用户流失案例数据（可在官网进行下载）

基于Kaggle电信用户流失案例数据（可在官网进行下载）

导入库

# 基础数据科学运算库
import numpy as np
import pandas as pd

# 可视化库
import seaborn as sns
import matplotlib.pyplot as plt

# 时间模块
import time

import warnings
warnings.filterwarnings('ignore')

# sklearn库
# 数据预处理
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# 实用函数
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

# 常用评估器
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 网格搜索
from sklearn.model_selection import GridSearchCV

# 自定义评估器支持模块
from sklearn.base import BaseEstimator, TransformerMixin

# re模块相关
import inspect, re

# 其他模块
from tqdm import tqdm
import gc

数据预处理模块

然后进行数据清洗相关工作：

# 读取数据
tcc = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# 标注连续/离散字段
# 离散字段
category_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 
                'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
                'PaymentMethod']

# 连续字段
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
 
# 标签
target = 'Churn'

# ID列
ID_col = 'customerID'

# 验证是否划分能完全
assert len(category_cols) + len(numeric_cols) + 2 == tcc.shape[1]

# 连续字段转化
tcc['TotalCharges']= tcc['TotalCharges'].apply(lambda x: x if x!= ' ' else np.nan).astype(float)
tcc['MonthlyCharges'] = tcc['MonthlyCharges'].astype(float)

# 缺失值填补
tcc['TotalCharges'] = tcc['TotalCharges'].fillna(0)

# 标签值手动转化 
tcc['Churn'].replace(to_replace='Yes', value=1, inplace=True)
tcc['Churn'].replace(to_replace='No',  value=0, inplace=True)

features = tcc.drop(columns=[ID_col, target]).copy()
labels = tcc['Churn'].copy()

时序特征衍生

# 定义辅助函数
def colName(ColumnTransformer, numeric_cols, category_cols):
    col_name = []
    col_value = ColumnTransformer.named_transformers_['cat'].categories_
    
    for i, j in enumerate(category_cols):
        if len(col_value[i]) == 2:
            col_name.append(j)
        else:
            for f in col_value[i]:
                feature_name = j