实验目的
1.获取数据并对数据进行预处理
2.构建重采样函数
3.模型训练
4.预测并生成结果
实验要求
1.掌握python中数据预处理的基本方法
2.掌握logistic回归模型
3.掌握随机森林分类模型
4.掌握xgboost模型
实验原理
对银行贷款客户的信息数据进行一系列预处理过后,分别建立逻辑回归模型、随机森林分类模型、xgboost模型对模型进行分析,然后以xgboost模型对测试集客户的违约概率进行预测,通过借款人的违约概率将客户划分为A、B、C、D、E、F、G七个等级。
# Sanity-check the two non-sklearn dependencies and show which
# versions this script was run against.
import imblearn
import xgboost as xgb

imblearn_version = imblearn.__version__
print(imblearn_version)

xgb_version = xgb.__version__
print(xgb_version)
# Preprocessing pipeline
# Step 1: load the raw CSV and take a first look at it.
import pandas as pd
import numpy as np

# Training file is expected next to this script; the label column is
# 'SeriousDlqin2yrs' (1 = serious delinquency within two years).
data = pd.read_csv('cs-training.csv')

print("数据基本信息:")
print(data.head())  # first five rows

print("\n数据缺失值情况:")
print(data.isna().sum())  # per-column missing-value counts

print("\n数据类型分布:")
print(data.dtypes)  # per-column dtypes
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Split the frame into features and label.
# "Unnamed: 0" is the row-id column produced by the CSV export.
X = data.drop(["SeriousDlqin2yrs", "Unnamed: 0"], axis=1)
y = data["SeriousDlqin2yrs"]

# Every remaining column is numeric.
numerical_cols = X.columns.tolist()

# BUG FIX: the original transformer only applied StandardScaler, so
# NaNs survived into X_processed even though the comment below
# promised median imputation.  Impute first, then standardize.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler()),
                ]
            ),
            numerical_cols,
        )
    ],
    remainder="passthrough",  # keep any column not listed above
)
# Median-fill missing values, then standardize.
# NOTE(review): fitting on the full data set before the split leaks
# test-set statistics into the scaler; later sections re-fit on the
# training split only, which is the correct pattern.
X_processed = preprocessor.fit_transform(X)

# Stratified split preserves the (imbalanced) class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np


def resample_data(X, y, method="smote"):
    """Clean the feature matrix and rebalance the classes.

    Parameters
    ----------
    X : array-like
        Feature matrix; may contain NaN and +/-inf values.
    y : array-like
        Binary class labels aligned with ``X``.
    method : str, default "smote"
        "smote" for SMOTE over-sampling, "under" for random
        under-sampling.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` — rebalanced features/labels.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Median-fill missing values.
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    # Turn +/-inf into NaN, then fill those with the learned medians
    # as well.  (nan=np.nan is a no-op for existing NaNs; only the
    # inf replacement matters here.)
    X_imputed = np.nan_to_num(X_imputed, nan=np.nan, posinf=np.nan, neginf=np.nan)
    X_imputed = imputer.transform(X_imputed)

    # Standardize so SMOTE's nearest-neighbour search is not dominated
    # by features with large numeric ranges.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # BUG FIX: the original code left `sampler` undefined for any
    # method other than "smote" and crashed with a NameError at the
    # fit_resample call below.
    if method == "smote":
        sampler = SMOTE(random_state=42)
    elif method == "under":
        sampler = RandomUnderSampler(random_state=42)
    else:
        raise ValueError(f"unknown resampling method: {method!r}")

    X_resampled, y_resampled = sampler.fit_resample(X_scaled, y)
    return X_resampled, y_resampled


X_train_resampled, y_train_resampled = resample_data(X_train, y_train)
# BUG FIX: neither of these names was imported anywhere in the file,
# so lines below raised NameError.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# --- Logistic-regression baseline ---------------------------------
# 1. Preprocessing tools, fitted on the TRAINING split only.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# 2. Fit+transform the training data.
X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# 3. Transform the test data with the statistics learned on train
#    (transform only — never fit on test data).
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# 4. Rebalance the training classes with SMOTE.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# 5. Train the model on the rebalanced data.
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train_resampled, y_train_resampled)

# 6. Evaluate on the preprocessed test data (AUC = area under ROC).
y_pred_prob_logreg = logreg_model.predict_proba(X_test_scaled)[:, 1]
print("逻辑回归 AUC:", roc_auc_score(y_test, y_pred_prob_logreg))
# BUG FIX: RandomForestClassifier was never imported anywhere in the
# file, so the model construction below raised NameError.  The
# roc_auc_score import keeps this section self-contained.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# --- Random-forest model -------------------------------------------
# 1. Preprocessing tools, fitted on the TRAINING split only.
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

# 2. Fit+transform the training data.
X_train_imputed = imputer.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# 3. Rebalance the training classes with SMOTE.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# 4. Train the model.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# 5. Preprocess the test data with statistics learned on train.
X_test_imputed = imputer.transform(X_test)
X_test_scaled = scaler.transform(X_test_imputed)

# 6. Guard against any remaining invalid values in the test matrix.
if np.isnan(X_test_scaled).any() or np.isinf(X_test_scaled).any():
    print("测试数据仍包含 NaN 或无穷大值,需要进一步处理")
    # Fallback: clamp to finite sentinel values.
    X_test_scaled = np.nan_to_num(X_test_scaled, nan=0.0, posinf=1e10, neginf=-1e10)

# 7. Evaluate (AUC on predicted positive-class probabilities).
y_pred_prob_rf = rf_model.predict_proba(X_test_scaled)[:, 1]
print("随机森林 AUC:", roc_auc_score(y_test, y_pred_prob_rf))
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import xgboost as xgb

# Reload the raw data so this section stands on its own.
data = pd.read_csv("cs-training.csv")

# Drop the label and the exported row-id column; everything that
# remains is a numeric feature.
X = data.drop(["SeriousDlqin2yrs", "Unnamed: 0"], axis=1)
y = data["SeriousDlqin2yrs"]
numerical_cols = list(X.columns)

# Standardize every numeric column; anything not listed passes
# through untouched.
preprocessor = ColumnTransformer(
    transformers=[("num", StandardScaler(), numerical_cols)],
    remainder="passthrough",
)

# Stratified 80/20 split on the RAW features — the preprocessor is
# fitted on the training split only, further below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# BUG FIX: roc_auc_score was never imported anywhere in this file,
# so the evaluation line below raised NameError.
from sklearn.metrics import roc_auc_score

# Fit the scaler on the training split only, then median-impute the
# (scaled) training features.
X_train_processed = preprocessor.fit_transform(X_train)
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train_processed)

# Train the XGBoost classifier on the (imbalanced) training data.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=100,
    random_state=42
)
xgb_model.fit(X_train_imputed, y_train)

# Transform the test data with the parameters learned on train
# (transform only — never fit on test data).
X_test_processed = preprocessor.transform(X_test)
X_test_imputed = imputer.transform(X_test_processed)

# Predicted probability of default (positive class) and AUC.
predicted_probs = xgb_model.predict_proba(X_test_imputed)[:, 1]
print("XGBoost AUC:", roc_auc_score(y_test, predicted_probs))
# Rebuild a DataFrame around the raw test features (keeps the
# original index and column names).
test_data = pd.DataFrame(X_test, columns=X.columns)

# Attach the per-customer default probability.
test_data["default_probability"] = predicted_probs

# Persist the predictions.
test_data.to_csv("loan_predictions.csv", index=False)
print("预测结果已保存到 loan_predictions.csv")

# Map probabilities onto risk grades A (best) .. H (worst).
thresholds = [0.05, 0.15, 0.3, 0.45, 0.6, 0.75, 0.9]
labels = ["A", "B", "C", "D", "E", "F", "G", "H"]
# BUG FIX: with right=False the original top bin was [0.9, 1), so a
# predicted probability of exactly 1.0 fell outside every bin and was
# graded NaN.  Using np.inf as the top edge keeps all other interval
# boundaries identical while covering the full range.
test_data["grade"] = pd.cut(
    test_data["default_probability"],
    bins=[0] + thresholds + [np.inf],
    labels=labels,
    right=False
)

# Distribution of customers across grades.
print(test_data["grade"].value_counts())

# Realised probability range inside each grade (min/max are NaN for
# empty grades — that is acceptable for this diagnostic print).
for grade in labels:
    grade_data = test_data[test_data["grade"] == grade]
    print(f"等级 {grade}: 概率范围 [{grade_data['default_probability'].min():.4f}, {grade_data['default_probability'].max():.4f}]")
644

被折叠的 条评论
为什么被折叠?



