Day13 不平衡数据集的处理

最新推荐文章于 2025-12-16 10:44:31 发布

原创最新推荐文章于 2025-12-16 10:44:31 发布 · 735 阅读

18 ·

CC 4.0 BY-SA版权

文章标签：

#python

python打卡专栏收录该内容

43 篇文章

订阅专栏

@浙大疏锦行

不平衡数据集的处理策略：过采样、修改权重、修改阈值
交叉验证代码

之前的数据预处理代码

from pickle import TRUE  # NOTE(review): TRUE is never referenced in the visible code; looks like an accidental auto-import -- confirm and remove.
import pandas as pd    # Tabular data loading and manipulation.
import numpy as np     # Numerical computing with efficient array operations.
import matplotlib.pyplot as plt    # Plotting.
import seaborn as sns   # Higher-level statistical plotting built on matplotlib.
from sklearn.preprocessing import StandardScaler
# Configure a CJK-capable font so the Chinese plot titles render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei is a common font on Windows.
plt.rcParams['axes.unicode_minus'] = False    # Render the minus sign correctly with this font.

# Load the dataset from the current working directory.
data = pd.read_csv('heart.csv')
print("数据基本信息：")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits an extra "None" line.
data.info()
print("\n数据前5行预览:")
print(data.head())

# Name of the label column; it is excluded from both feature lists.
TARGET_COLUMN = 'target'

# Partition every non-label column: object-dtype columns and columns with at
# most 5 distinct values count as discrete, everything else as continuous.
discrete_feature, continue_feature = [], []
for feature in data.columns:
    if feature == TARGET_COLUMN:
        continue
    column = data[feature]
    is_discrete = column.dtype == 'object' or column.nunique() <= 5
    (discrete_feature if is_discrete else continue_feature).append(feature)

print("离散特征：", discrete_feature)
print("连续特征：", continue_feature)

# Keep the label column as its own Series and report its class distribution.
target = data[TARGET_COLUMN]
print("标签列名称：", TARGET_COLUMN)
print("标签分布：", target.value_counts())

# Outlier handling for the continuous features (IQR rule).

# 1. Visual check: one box plot per continuous feature, side by side.
plt.figure(figsize=(15, 5))
panel_count = len(continue_feature)
for position, feature in enumerate(continue_feature, start=1):
    plt.subplot(1, panel_count, position)
    sns.boxplot(x=data[feature])
    plt.title(f'{feature} 异常值检测')
plt.tight_layout()
# The figure is built but not displayed here; call plt.show() to view it.

# 2. Clip values to the IQR fences on a copy, leaving `data` untouched.
data_clean = data.copy()
for feature in continue_feature:
    Q1, Q3 = data_clean[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Count offenders on the original frame, which is never clipped.
    raw_values = data[feature]
    outliers = data[raw_values.lt(lower_bound) | raw_values.gt(upper_bound)]

    # Clip instead of dropping rows so that no samples are lost.
    data_clean[feature] = data_clean[feature].clip(lower=lower_bound, upper=upper_bound)
    print(f'{feature}: 检测到{len(outliers)}个异常值，已截断处理')

# One-hot encode the discrete features. drop_first=True drops one dummy per
# feature so the remaining dummies are not perfectly collinear.
data_encoded = pd.get_dummies(data_clean, columns=discrete_feature, drop_first=True)
print("\n独热编码后的特征数量:", data_encoded.shape[1])

# Convert only the boolean dummy columns to 0/1 integers. Casting the whole
# frame with astype(int) would also truncate fractional values in the
# continuous columns (and fractional IQR clip bounds) before standardization.
dummy_columns = data_encoded.select_dtypes(include='bool').columns
data_encoded = data_encoded.astype({column: int for column in dummy_columns})

# Standardize the continuous features.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in this file, so test-set statistics leak
# into the features -- consider fitting on the training split only.
scaler = StandardScaler()
data_encoded[continue_feature] = scaler.fit_transform(data_encoded[continue_feature])


# Feature engineering: derived physiological features.
#
# The ratios/products are computed from the raw (outlier-clipped) values in
# data_clean, not from data_encoded: the continuous columns of data_encoded
# were already standardized above, and dividing by a zero-mean z-score blows
# up near zero and flips sign, which makes the ratio meaningless.
# Assumes raw 'age' is strictly positive -- TODO confirm for other datasets.
new_features = ['thalach_age_ratio', 'bp_chol_risk', 'st_depression_ratio']
derived = pd.DataFrame(
    {
        # Heart rate relative to age.
        'thalach_age_ratio': data_clean['thalach'] / data_clean['age'],
        # Blood pressure x cholesterol product as a combined risk index.
        'bp_chol_risk': data_clean['trestbps'] * data_clean['chol'],
        # ST depression relative to heart rate; replace(0, 0.1) avoids division by zero.
        'st_depression_ratio': data_clean['oldpeak'] / data_clean['thalach'].replace(0, 0.1),
    },
    index=data_encoded.index,
)

# Standardize the derived features with their own scaler so they share the
# scale of the other continuous features without refitting `scaler`.
data_encoded[new_features] = StandardScaler().fit_transform(derived[new_features])

# Register the new columns as continuous features.
continue_feature.extend(new_features)
print("新增衍生特征后连续特征集:", continue_feature)

# Collinearity pruning among the continuous features.
correlation_threshold = 0.8
corr_matrix = data_encoded[continue_feature].corr().abs()

# Keep only the strict upper triangle so each feature pair is inspected once;
# for a correlated pair, the later column is the one that gets flagged.
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper = corr_matrix.where(upper_mask)
exceeds_threshold = (upper > correlation_threshold).any(axis=0)
to_drop = [column for column, flagged in exceeds_threshold.items() if flagged]

if to_drop:
    data_encoded = data_encoded.drop(columns=to_drop)
    print(f'移除高共线性特征: {to_drop}')
    # Keep the continuous-feature list in sync with the remaining columns.
    continue_feature = [name for name in continue_feature if name not in to_drop]

# Preview the preprocessed data.
print("\n预处理后的数据预览:")
print(data_encoded.head())

# Train/test split.
from sklearn.model_selection import train_test_split
# Separate the features from the label column.
X = data_encoded.drop(TARGET_COLUMN, axis=1)
y = data_encoded[TARGET_COLUMN]
# stratify=y keeps the class proportions identical in both splits. Without
# it, a random split of an imbalanced dataset can leave the test set with too
# few minority samples to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

基准模型

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical
import time
import warnings
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score  # Classifier evaluation metrics.
from sklearn.metrics import classification_report, confusion_matrix  # Classification report and confusion matrix.
# NOTE(review): this silences every warning from here on, including any
# solver convergence warnings.
warnings.filterwarnings("ignore")

# --- 1. Baseline: logistic regression with default hyper-parameters ---
print("--- 1. 默认参数逻辑回归 (训练集 -> 测试集) ---")
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can be too coarse for a
# fit that takes only a few milliseconds.
start_time = time.perf_counter()
# Only random_state is set; every other hyper-parameter keeps the library
# default (in current scikit-learn releases: L2 penalty with C=1.0 and the
# lbfgs solver -- verify against the installed version).
base_model = LogisticRegression(random_state=42)
# Fit on the training split.
base_model.fit(X_train, y_train)
# Predict on the held-out test split.
base_pred = base_model.predict(X_test)
end_time = time.perf_counter()

# The measured interval covers both fitting and prediction.
print(f"训练预测耗时: {end_time - start_time:.4f} 秒")
print("默认参数逻辑回归分类报告：")
print(classification_report(y_test, base_pred))
print("默认参数逻辑回归混淆矩阵：")
print(confusion_matrix(y_test, base_pred))

--- 1. 默认参数逻辑回归 (训练集 -> 测试集) ---
训练预测耗时: 0.0128 秒
默认参数逻辑回归分类报告：
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

默认参数逻辑回归混淆矩阵：
[[27  2]
 [ 4 28]]

不平衡数据集指的是数据集中不同类别的样本数量相差很大（比如1000个正常样本，只有10个患病样本）。这种情况下，模型很容易偏向多数类，导致少数类预测效果差。

以下是常见的处理策略：

1. 数据层面

过采样（让少数类变多）：简单复制少数类样本（容易过拟合）

SMOTE（人工合成少数类样本）：找少数类样本的邻居，在它们之间"插值"生成新样本

ADASYN（根据难度动态生成样本）：难分类的少数类样本生成更多新样本

欠采样（让多数类变少）：随机删除多数类样本（可能丢失信息）

聚类欠采样：对多数类聚类，只保留每个簇的代表样本

混合采样：先欠采样多数类，再过采样少数类（如SMOTE+ENN）

2. 算法层面

调整类别权重：告诉模型少数类更重要（如逻辑回归的class_weight参数）

改变评估指标：不用准确率，改用对不平衡数据更友好的指标：

精确率（查准率）：预测为正类的样本中，真正是正类的比例

召回率（查全率）：所有正类样本中，被正确预测的比例

F1分数：精确率和召回率的调和平均

AUC：反映模型区分正负类的能力

集成方法：

Bagging：用不同的多数类子集与少数类组合，训练多个模型投票

Boosting：重点学习难分类的样本

3. 模型层面

选择对不平衡数据更鲁棒的模型（如决策树、随机森林），而不是线性模型

三种优化


# 1. Logistic regression with adjusted class weights
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import classification_report, confusion_matrix

# class_weight='balanced' lets scikit-learn derive the class weights
# automatically instead of treating every sample equally.
balanced_model = LogisticRegression(random_state=42, class_weight='balanced')
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be too coarse for a millisecond-scale fit.
start_time_balanced = time.perf_counter()
balanced_model.fit(X_train, y_train)
end_time_balanced = time.perf_counter()

print(f"调整类别权重的逻辑回归 训练耗时: {end_time_balanced - start_time_balanced:.4f} 秒")

# Predict on the held-out test split.
balanced_pred = balanced_model.predict(X_test)

print("\n调整类别权重的逻辑回归 在测试集上的分类报告：")
print(classification_report(y_test, balanced_pred))
print("调整类别权重的逻辑回归 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, balanced_pred))

# 2. Logistic regression trained on a SMOTE-oversampled training set
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
import time
from sklearn.metrics import classification_report, confusion_matrix

# Oversample the training split only; the test split is left untouched so the
# evaluation below uses real samples exclusively.
# NOTE(review): X_train contains one-hot columns produced by pd.get_dummies
# earlier in this file. Plain SMOTE interpolates between samples, so the
# synthetic rows may carry fractional dummy values -- consider SMOTENC if
# that matters for the model.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("SMOTE过采样后训练集的形状：", X_train_smote.shape, y_train_smote.shape)

# Train logistic regression on the oversampled training set.
logreg_model_smote = LogisticRegression(random_state=42)
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be too coarse for a millisecond-scale fit.
start_time_smote = time.perf_counter()
logreg_model_smote.fit(X_train_smote, y_train_smote)
end_time_smote = time.perf_counter()

print(f"SMOTE过采样后训练耗时: {end_time_smote - start_time_smote:.4f} 秒")

# Predict on the held-out test split.
logreg_pred_smote = logreg_model_smote.predict(X_test)

print("\nSMOTE过采样后逻辑回归 在测试集上的分类报告：")
print(classification_report(y_test, logreg_pred_smote))
print("SMOTE过采样后逻辑回归 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, logreg_pred_smote))


# 3. SMOTE oversampling + logistic regression, evaluated with cross-validation
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import classification_report, confusion_matrix

# Put SMOTE inside the pipeline so that cross-validation re-runs the
# resampling for every fold instead of oversampling once up front.
# NOTE(review): this relies on imblearn's Pipeline applying samplers only
# while fitting -- confirm against the imbalanced-learn documentation.
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('model', LogisticRegression(random_state=42))
])

# 5-fold cross-validation on the training split, scored by ROC AUC.
print("\n--- SMOTE+交叉验证结果 ---")
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"交叉验证AUC: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Refit the whole pipeline on the full training split.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be too coarse for a millisecond-scale fit.
start_time_pipeline = time.perf_counter()
pipeline.fit(X_train, y_train)
end_time_pipeline = time.perf_counter()

print(f"SMOTE+交叉验证模型 训练耗时: {end_time_pipeline - start_time_pipeline:.4f} 秒")

# Predict on the held-out test split.
pipeline_pred = pipeline.predict(X_test)

print("\nSMOTE+交叉验证逻辑回归 在测试集上的分类报告：")
print(classification_report(y_test, pipeline_pred))
print("SMOTE+交叉验证逻辑回归 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, pipeline_pred))

调整类别权重的逻辑回归 训练耗时: 0.0079 秒

调整类别权重的逻辑回归 在测试集上的分类报告：
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

调整类别权重的逻辑回归 在测试集上的混淆矩阵：
[[27  2]
 [ 4 28]]
SMOTE过采样后训练集的形状： (266, 25) (266,)
SMOTE过采样后训练耗时: 0.0130 秒

SMOTE过采样后逻辑回归 在测试集上的分类报告：
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

SMOTE过采样后逻辑回归 在测试集上的混淆矩阵：
[[27  2]
 [ 4 28]]

--- SMOTE+交叉验证结果 ---
交叉验证AUC: 0.8956 ± 0.0250
SMOTE+交叉验证模型 训练耗时: 0.0228 秒

SMOTE+交叉验证逻辑回归 在测试集上的分类报告：
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

SMOTE+交叉验证逻辑回归 在测试集上的混淆矩阵：
[[27  2]
 [ 4 28]]

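发现调整之后结果并没有变好：三种方法在测试集上的分类报告和混淆矩阵与基准模型完全相同。原因在于这个数据集本身基本平衡——SMOTE 过采样后训练集只有 266 条（即多数类 133 条，少数类约 109 条），测试集两类分别为 29 和 32 条——因此类别权重和过采样几乎没有可以修正的偏差。这些方法只有在类别比例明显失衡时才会带来明显变化。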

# 4. Class-weighted logistic regression, evaluated with cross-validation
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time
from sklearn.metrics import classification_report, confusion_matrix

# class_weight='balanced' lets scikit-learn derive the class weights
# automatically instead of treating every sample equally.
balanced_cv_model = LogisticRegression(random_state=42, class_weight='balanced')

# 5-fold cross-validation on the training split, scored by ROC AUC.
print("\n--- 调整类别权重+交叉验证结果 ---")
cv_scores_balanced = cross_val_score(balanced_cv_model, X_train, y_train, cv=5, scoring='roc_auc')
print(f"交叉验证AUC: {cv_scores_balanced.mean():.4f} ± {cv_scores_balanced.std():.4f}")

# Refit the model on the full training split.
# perf_counter() is the monotonic, high-resolution clock intended for timing
# short intervals; time.time() can be too coarse for a millisecond-scale fit.
start_time_balanced_cv = time.perf_counter()
balanced_cv_model.fit(X_train, y_train)
end_time_balanced_cv = time.perf_counter()

print(f"调整类别权重+交叉验证模型 训练耗时: {end_time_balanced_cv - start_time_balanced_cv:.4f} 秒")

# Predict on the held-out test split.
balanced_cv_pred = balanced_cv_model.predict(X_test)

print("\n调整类别权重+交叉验证逻辑回归 在测试集上的分类报告：")
print(classification_report(y_test, balanced_cv_pred))
print("调整类别权重+交叉验证逻辑回归 在测试集上的混淆矩阵：")
print(confusion_matrix(y_test, balanced_cv_pred))

--- 调整类别权重+交叉验证结果 ---
交叉验证AUC: 0.8984 ± 0.0215
调整类别权重+交叉验证模型 训练耗时: 0.0063 秒

调整类别权重+交叉验证逻辑回归 在测试集上的分类报告：
              precision    recall  f1-score   support

           0       0.87      0.93      0.90        29
           1       0.93      0.88      0.90        32

    accuracy                           0.90        61
   macro avg       0.90      0.90      0.90        61
weighted avg       0.90      0.90      0.90        61

调整类别权重+交叉验证逻辑回归 在测试集上的混淆矩阵：
[[27  2]
 [ 4 28]]

实践建议

1. 评估指标先行： 明确你的目标，使用适合不平衡数据的指标（Recall, F1-Score, AUC-PR, Balanced Accuracy, MCC）来评估模型。

2. 优先尝试根本方法：通常建议首先尝试**修改权重** (`class_weight='balanced'`) 或**数据采样方法** (如 SMOTE)，因为它们试图从源头改善模型学习。

3. 交叉验证评估：在使用 `class_weight` 或采样方法时，务必使用分层交叉验证 (Stratified K-Fold) 来获得对模型性能的可靠估计。

4. 阈值调整作为补充：修改阈值可以作为一种补充手段或最后的微调。即使使用了权重调整，有时仍需根据具体的业务需求（如必须达到某个召回率水平）来调整阈值，找到最佳的操作点。

5. 组合策略：有时结合多种方法（如 SMOTE + `class_weight`）可能会产生更好的结果。

总之，修改权重旨在训练一个“更好”的模型，而修改阈值是在一个“已有”模型上调整其表现。理解它们的差异有助于你选择更合适的策略来应对不平衡数据集的挑战。