import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import re
# XGBoost removed
def make_ohe(**kwargs):
    """Version shim: sklearn renamed OneHotEncoder's `sparse` kwarg to `sparse_output`."""
    try:
        return OneHotEncoder(**kwargs)
    except TypeError:
        # Newer sklearn (>= 1.4) removed `sparse` in favour of `sparse_output`.
        if 'sparse' in kwargs:
            kwargs['sparse_output'] = kwargs.pop('sparse')
        return OneHotEncoder(**kwargs)
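# Optional smoke test for the shim (a sketch; `_probe` is a throwaway local):
# it should yield an encoder on both the old (`sparse`) and new
# (`sparse_output`) sklearn APIs.
_probe = make_ohe(handle_unknown='ignore', sparse=False)
assert isinstance(_probe, OneHotEncoder)
del _probe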
# ---------- 1. Load data ----------
df_train = pd.read_csv('训练数据集.csv')
df_test = pd.read_csv('测试集.csv')
id_col = 'id'
target_col = 'target'
# Three feature groups, as specified by the user.
# Continuous numeric (need log transform / winsorizing):
cont_vars = ['amount', 'income', 'total_balance', 'credict_used_amount', 'credict_limit',
             'last_overdue_months', 'recent_account_months', 'last_credict_card_months']
# Discrete numeric (counts; keep raw values and flag zeros):
disc_vars = ['overdue_times', 'default_times', 'total_default_number', 'inquire_times',
             'closed_credict_cards', 'mortage_number', 'account_number', 'loan_history',
             'recent_loan_number', 'half_used_credict_card', 'total_credict_card_number']
# Categorical:
cat_vars = ['housing', 'purpose']
# Guard against missing columns: keep only the variables that actually exist in the training set.
all_cols = list(df_train.columns)
if target_col in all_cols:
all_cols.remove(target_col)
if id_col in all_cols:
all_cols.remove(id_col)
def safe_intersect(lst):
    return [c for c in lst if c in df_train.columns]
cont_vars = safe_intersect(cont_vars)
disc_vars = safe_intersect(disc_vars)
cat_vars = safe_intersect(cat_vars)
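# Quick diagnostic: report how many of the declared variables were actually
# found, so silent column-name mismatches (e.g. the 'credict' spellings) show up early.
print(f'features found -> cont: {len(cont_vars)}, disc: {len(disc_vars)}, cat: {len(cat_vars)}')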
# ---------- 2. Transformer that learns its parameters on the training set ----------
class TrainBasedPreprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, cont_vars, disc_vars, cat_vars, winsor_q=(0.01, 0.99)):
        self.cont_vars = cont_vars
        self.disc_vars = disc_vars
        self.cat_vars = cat_vars
        self.winsor_q = winsor_q
        # Fitted state, populated in fit()
        self.winsor_bounds_ = {}
        self.scaler_ = None
        self.ohe_ = None
        self.binner_ = {}
        self.medians_ = {}
        self.modes_ = {}
    def fit(self, X, y=None):
        df = X.copy()
        # Missing values: median for numeric, mode for categorical. Store the
        # training statistics so transform() reuses them instead of recomputing
        # on the test set.
        for c in self.cont_vars + self.disc_vars:
            if c in df.columns:
                self.medians_[c] = df[c].median()
                df[c] = df[c].fillna(self.medians_[c])
        for c in self.cat_vars:
            if c in df.columns:
                self.modes_[c] = df[c].mode().iloc[0]
                df[c] = df[c].fillna(self.modes_[c])
        # Winsor bounds from training quantiles (used for the continuous vars).
        for c in self.cont_vars:
            if c in df.columns:
                lo = df[c].quantile(self.winsor_q[0])
                hi = df[c].quantile(self.winsor_q[1])
                self.winsor_bounds_[c] = (lo, hi)
        # Continuous vars: log(1+x) after winsorizing, then fit the scaler.
        cont_vals = []
        for c in self.cont_vars:
            if c in df.columns:
                cont_vals.append(np.log1p(df[c].clip(*self.winsor_bounds_[c])))
        if cont_vals:
            cont_mat = np.vstack(cont_vals).T
            self.scaler_ = StandardScaler().fit(cont_mat)
        # Categorical encoder, fitted on the training categories only.
        if self.cat_vars:
            cats = [c for c in self.cat_vars if c in df.columns]
            if cats:
                self.ohe_ = make_ohe(
                    handle_unknown='ignore',  # unseen test categories encode to all zeros
                    sparse=False,             # dense output for easy DataFrame handling
                    drop='if_binary',         # drop one level of binary features to avoid collinearity
                    dtype=np.float32          # float32 to save memory
                )
                cat_df = df[cats].astype(str)
                self.ohe_.fit(cat_df)
        # Binning: equal-frequency bins (5) for total_credict_card_number, as an example.
        if 'total_credict_card_number' in df.columns:
            try:
                bins = pd.qcut(df['total_credict_card_number'], 5, duplicates='drop', retbins=True)[1]
                self.binner_['total_credict_card_number'] = bins
            except Exception:
                # Too few distinct values to bin; fall back to the raw column.
                self.binner_['total_credict_card_number'] = None
        return self
    def transform(self, X):
        df = X.copy()
        # Fill missing values with the statistics learned on the training set.
        for c in self.cont_vars + self.disc_vars:
            if c in df.columns:
                df[c] = df[c].fillna(self.medians_.get(c, df[c].median()))
        for c in self.cat_vars:
            if c in df.columns:
                df[c] = df[c].fillna(self.modes_.get(c, df[c].mode().iloc[0]))
        # Continuous vars: winsorize -> log1p -> standardize.
        cont_trans = []
        cont_present = [c for c in self.cont_vars if c in df.columns]
        for c in cont_present:
            lo, hi = self.winsor_bounds_.get(c, (df[c].min(), df[c].max()))
            v = np.log1p(df[c].clip(lo, hi))
            cont_trans.append(v.values)
            df[c + '_log'] = v
        if cont_trans and self.scaler_ is not None:
            cont_mat = np.vstack(cont_trans).T
            cont_scaled = self.scaler_.transform(cont_mat)
            # Add standardized columns; enumerate only the columns actually
            # present so indices line up with the scaled matrix.
            for i, c in enumerate(cont_present):
                df[c + '_s'] = cont_scaled[:, i]
        # Discrete vars: keep the raw counts and add a zero-value indicator.
        for c in self.disc_vars:
            if c in df.columns:
                df[c] = df[c].fillna(0)
                df[c + '_is_zero'] = (df[c] == 0).astype(int)
        # Categorical vars: one-hot encode.
        if self.ohe_ is not None:
            cats = [c for c in self.cat_vars if c in df.columns]
            if cats:
                cat_df = df[cats].astype(str)
                ohe_vals = self.ohe_.transform(cat_df)
                # Ask the encoder for its output names: with drop='if_binary'
                # the number of output columns differs from len(categories_),
                # so building names by hand would misalign the DataFrame.
                try:
                    feature_names = self.ohe_.get_feature_names_out(cats)
                except AttributeError:  # sklearn < 1.0
                    feature_names = self.ohe_.get_feature_names(cats)
                ohe_df = pd.DataFrame(ohe_vals, columns=feature_names, index=df.index)
                df = pd.concat([df, ohe_df], axis=1)
                # Drop the raw categorical columns; they are fully encoded now.
                df.drop(columns=cats, inplace=True)
        # Apply the training-set bins.
        if 'total_credict_card_number' in df.columns and self.binner_.get('total_credict_card_number') is not None:
            bins = self.binner_['total_credict_card_number']
            df['total_credict_card_number_bin'] = pd.cut(df['total_credict_card_number'], bins=bins, include_lowest=True).astype(str)
        # Return the full DataFrame (the id column, if present, is preserved).
        return df
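# A minimal sketch of the continuous-variable chain the class applies
# (winsorize -> log1p -> standardize), on a toy Series. `_toy` is illustrative
# only; the real bounds come from the training quantiles computed in fit().
# Note StandardScaler standardizes with the population std (ddof=0).
_toy = pd.Series([1.0, 2.0, 3.0, 1000.0])
_lo, _hi = _toy.quantile(0.01), _toy.quantile(0.99)
_logged = np.log1p(_toy.clip(_lo, _hi))
_standardized = (_logged - _logged.mean()) / _logged.std(ddof=0)
del _toy, _lo, _hi, _logged, _standardized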
# ---------- 3. Run preprocessing, train the models, and predict ----------
# Prepare the training data
drop_cols = [id_col, target_col] if target_col in df_train.columns else [id_col]
features = [c for c in df_train.columns if c not in drop_cols]
X_train_raw = df_train[features]
y_train = df_train[target_col]
X_test_raw = df_test[[c for c in df_test.columns if c != id_col]] if id_col in df_test.columns else df_test.copy()
pre = TrainBasedPreprocessor(cont_vars, disc_vars, cat_vars)
pre.fit(X_train_raw, y_train)
X_train = pre.transform(X_train_raw)
X_test = pre.transform(X_test_raw)
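# Optional consistency check: with handle_unknown='ignore', unseen test
# categories cannot create new columns, so train and test should expose the
# same engineered features; a mismatch would break the selection step below.
_missing = set(X_train.columns) - set(X_test.columns)
if _missing:
    print('Warning: engineered columns absent from test:', sorted(_missing))
del _missing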
# Final model features: standardized continuous vars, raw discrete vars plus their is_zero flags, OHE columns, and the bin column.
selected_cols = []
for c in cont_vars:
if c in X_train.columns:
if c + '_s' in X_train.columns:
selected_cols.append(c + '_s')
else:
selected_cols.append(c)
for c in disc_vars:
if c in X_train.columns:
selected_cols.append(c)
if c + '_is_zero' in X_train.columns:
selected_cols.append(c + '_is_zero')
# ohe columns
ohe_cols = [c for c in X_train.columns if any(c.startswith(p + '_') for p in cat_vars)]
selected_cols += ohe_cols
if 'total_credict_card_number_bin' in X_train.columns:
    # Use one-hot dummies of the bin column.
    selected_cols.append('total_credict_card_number_bin')
# If the bin column made it into the list, expand it to dummies.
if 'total_credict_card_number_bin' in selected_cols:
    bin_dummies_train = pd.get_dummies(X_train['total_credict_card_number_bin'], prefix='tc_bin')
    bin_dummies_test = pd.get_dummies(X_test['total_credict_card_number_bin'], prefix='tc_bin')
    # Align the dummy columns across train and test.
    bin_dummies_train, bin_dummies_test = bin_dummies_train.align(bin_dummies_test, join='outer', axis=1, fill_value=0)
    X_train = pd.concat([X_train, bin_dummies_train], axis=1)
    X_test = pd.concat([X_test, bin_dummies_test], axis=1)
    # Swap the raw bin column for its dummies in the feature list.
    selected_cols = [c for c in selected_cols if c != 'total_credict_card_number_bin']
    selected_cols += list(bin_dummies_train.columns)
# Keep only the features that actually exist in X_train.
selected_cols = [c for c in selected_cols if c in X_train.columns]
X_train_final = X_train[selected_cols].fillna(0)
X_test_final = X_test[selected_cols].fillna(0)
# Sanitize feature names (no [, ], <, > etc.); a safeguard kept from the removed XGBoost variant, harmless for sklearn estimators.
orig_cols = X_train_final.columns.tolist()
safe_cols = ['f_{}_{}'.format(i, re.sub(r'[^0-9a-zA-Z_]', '_', str(c))) for i, c in enumerate(orig_cols)]
rename_map = dict(zip(orig_cols, safe_cols))
X_train_final.rename(columns=rename_map, inplace=True)
X_test_final.rename(columns=rename_map, inplace=True)
# Train and predict: linear regression and random forest.
# 1. Linear regression
lr = LinearRegression()
lr.fit(X_train_final, y_train)
lr_pred = lr.predict(X_test_final)
# 2. Random forest (hyper-parameters tuned via grid search)
rf = RandomForestRegressor(random_state=42)
rf_params = {'n_estimators': [100, 200], 'max_depth': [5, 10, None], 'min_samples_split': [2, 5]}
gs_rf = GridSearchCV(rf, rf_params, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
gs_rf.fit(X_train_final, y_train)
rf_pred = gs_rf.predict(X_test_final)
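# Optional: inspect what the grid search settled on; the CV RMSE is the square
# root of the negated best neg_mean_squared_error score.
print('RF best params:', gs_rf.best_params_)
print('RF CV RMSE: %.5f' % np.sqrt(-gs_rf.best_score_))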
# Weighted blend of the two models: 0.1 linear regression + 0.9 random forest
pred = lr_pred*0.1 + rf_pred*0.9
pred = np.clip(pred, 0, 1)
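# Optional in-sample reference (optimistic by construction, not a substitute
# for cross-validation): RMSE of the clipped blend on the training data.
_train_blend = np.clip(lr.predict(X_train_final)*0.1 + gs_rf.predict(X_train_final)*0.9, 0, 1)
print('train RMSE (blend, in-sample):', np.sqrt(mean_squared_error(y_train, _train_blend)))
del _train_blend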
# Write the submission
output_file = '提交结果7.csv'
df_submit = pd.DataFrame({'id': df_test[id_col], 'target': np.round(pred, 6)})
df_submit.to_csv(output_file, index=False)
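# Optional sanity check on the written file: two columns (id, target) and one
# row per test sample.
_check = pd.read_csv(output_file)
assert list(_check.columns) == ['id', 'target'] and len(_check) == len(df_test)
del _check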
print(f'Done: preprocess -> train -> predict; results saved to {output_file}')