The previous approach scored only about 0.55897 when submitted to Kaggle, and its data handling was frankly crude, with little real technique behind it. Not content to leave it there, I worked with AI to write the following program, whose predictions were a qualitative leap forward. The full program is below; I'll analyze it section by section afterwards.
# ========== 1) Imports & Paths ==========
import os
import numpy as np
import pandas as pd
# Read the training and test data
# data_dir is the data directory (similar to /kaggle/input/... in a Kaggle Notebook, but a local relative path here)
data_dir = "./input/"
train = pd.read_csv(os.path.join(data_dir, "train.csv"))
test = pd.read_csv(os.path.join(data_dir, "test.csv"))
# ---------- 2) Utility functions: strong features, imputation, ordinal mapping ----------
# This part gathers the small helpers reused for data cleaning and feature engineering
# 2.1 Impute missing LotFrontage with the median within each Neighborhood group
# Rationale: street-frontage distributions differ across neighborhoods, so group-wise
# medians usually beat a single global median
def impute_lotfrontage_by_neighborhood(df):
    df = df.copy()  # copy to avoid mutating the caller's frame in place
    if "LotFrontage" in df.columns and "Neighborhood" in df.columns:
        # For each row, take the LotFrontage median of its Neighborhood
        med = df.groupby("Neighborhood")["LotFrontage"].transform("median")
        # Fill missing values with that median; if an entire neighborhood is missing,
        # NaN remains (the downstream imputer catches it later)
        df["LotFrontage"] = df["LotFrontage"].fillna(med)
    return df
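# A quick illustration on a toy frame (illustrative only, values made up): transform("median")
# broadcasts each neighborhood's median back to row level, so fillna can work element-wise
_toy_lf = pd.DataFrame({"Neighborhood": ["A", "A", "B"],
                        "LotFrontage": [60.0, np.nan, 80.0]})
print(impute_lotfrontage_by_neighborhood(_toy_lf))  # the NaN in group "A" becomes 60.0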
# 2.2 Common-sense strong features
# These are very common and effective on house-price problems (area composites,
# bathroom totals/density, merged porches, house age, etc.)
def add_strong_features(df):
    df = df.copy()
    # Total area: above-ground living area + total basement area (unfinished space included)
    df["TotalSF"] = df.get("GrLivArea", 0) + df.get("TotalBsmtSF", 0)
    # Total "effective bathrooms": half baths count as 0.5 (above ground and basement combined)
    df["BathTotal"] = (df.get("FullBath", 0) + 0.5 * df.get("HalfBath", 0) +
                       df.get("BsmtFullBath", 0) + 0.5 * df.get("BsmtHalfBath", 0))
    # Bathroom density (bathrooms per 1,000 sq ft), a proxy for cramped vs. comfortable
    # replace(0, np.nan) avoids division by zero; /1000 is only a scale adjustment
    df["BathPer1kSF"] = df["BathTotal"] / (df["TotalSF"].replace(0, np.nan) / 1000.0)
    # Merge porch/deck areas: summing the porch columns reduces sparsity
    df["PorchSF"] = (df.get("OpenPorchSF", 0) + df.get("EnclosedPorch", 0) +
                     df.get("3SsnPorch", 0) + df.get("ScreenPorch", 0))
    # House age, years since remodel, and a "new house" flag (sold in the year it was built)
    if {"YrSold", "YearBuilt"}.issubset(df.columns):
        df["HouseAge"] = (df["YrSold"] - df["YearBuilt"]).clip(lower=0)  # clip keeps it non-negative
    if {"YrSold", "YearRemodAdd"}.issubset(df.columns):
        df["SinceRemod"] = (df["YrSold"] - df["YearRemodAdd"]).clip(lower=0)
    if {"YrSold", "YearBuilt"}.issubset(df.columns):
        df["IsNew"] = (df["YrSold"] == df["YearBuilt"]).astype(int)
    # Three "does it exist" boolean flags; many models respond strongly to has/has-not signals
    df["HasGarage"] = (df.get("GarageArea", 0) > 0).astype(int)
    df["HasFireplace"] = (df.get("Fireplaces", 0) > 0).astype(int)
    df["HasPool"] = (df.get("PoolArea", 0) > 0).astype(int)
    return df
# 2.3 Ordinal quality columns -> numeric mapping
# These columns are text (Ex/Gd/TA/Fa/Po) but are really ordered categories
# Map them to scores 5..1; missing/NA maps to 0, meaning "none / not applicable"
ORD_MAP = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, np.nan: 0, "NA": 0}
ORD_COLS = [
"ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
"KitchenQual", "FireplaceQu", "GarageQual", "GarageCond"
]
def map_ordered_qualities(df):
    df = df.copy()
    for c in ORD_COLS:
        if c in df.columns:
            # map + fillna as a double safety net; astype(float) eases later numeric handling
            df[c] = df[c].map(ORD_MAP).fillna(0).astype(float)
    return df
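# Quick sanity check on a toy Series (illustrative only): known grades map to 5..1,
# missing values become 0, and any unexpected string also falls back to 0 via fillna
_toy_q = pd.Series(["Ex", "TA", np.nan, "??"]).map(ORD_MAP).fillna(0).astype(float)
print("[ORD_MAP demo]", _toy_q.tolist())  # [5.0, 3.0, 0.0, 0.0]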
# 2.4 Fix GarageYrBlt anomalies (e.g., out-of-range years)
# An implausible year (before 1800, or after the sale year YrSold) is backfilled with
# YearBuilt; if YearBuilt is unavailable, it is set to NaN instead
def fix_garage_year(df):
df = df.copy()
if "GarageYrBlt" in df.columns:
mask = (df["GarageYrBlt"].notna())
if "YrSold" in df.columns:
bad = (df["GarageYrBlt"] < 1800) | (df["GarageYrBlt"] > df["YrSold"])
else:
bad = (df["GarageYrBlt"] < 1800)
if "YearBuilt" in df.columns:
df.loc[mask & bad, "GarageYrBlt"] = df.loc[mask & bad, "YearBuilt"]
else:
df.loc[mask & bad, "GarageYrBlt"] = np.nan
return df
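# Quick illustration (toy frame, values made up): a garage "built" after the sale year
# counts as bad and is rolled back to YearBuilt (the real test set reportedly contains
# exactly such a typo, GarageYrBlt = 2207)
_toy_g = pd.DataFrame({"GarageYrBlt": [2207.0, 1995.0],
                       "YrSold": [2007, 2007],
                       "YearBuilt": [2006, 1995]})
print(fix_garage_year(_toy_g))  # the first row becomes 2006.0, the second stays 1995.0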
# 2.5 OOF (Out-Of-Fold) target encoding for a given categorical column `col`:
# encode with the out-of-fold mean of log1p(SalePrice), smoothed to curb the
# overfitting risk of high-variance (rare) categories
from sklearn.model_selection import KFold
def oof_target_encode(X, y, X_test, col, n_splits=5, smooth=20):
    """
    Adds a new column f"{col}_te" derived from X[col]: a smoothed OOF mean encoding
    of log1p(y), which avoids target leakage.
    Parameters:
      - X: training feature table (DataFrame)
      - y: training target (array-like, aligned with X's index)
      - X_test: test feature table (DataFrame)
      - col: name of the categorical column to encode
      - n_splits: number of folds
      - smooth: smoothing strength (larger pulls encodings toward the global mean)
    Returns:
      - X_new, X_test_new: new DataFrames, each carrying the extra f"{col}_te" column
    """
    X = X.copy()
    X_test = X_test.copy()
    y = pd.Series(y, index=X.index)
    y_log = np.log1p(y)  # take means in log space, close to the evaluation metric
    global_mean = y_log.mean()  # global mean, used for smoothing and as a fallback
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    te = pd.Series(index=X.index, dtype=float)  # OOF encoding of each training sample
    for tr_idx, va_idx in kf.split(X):
        tr, va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr = y_log.iloc[tr_idx]
        # per-category mean and count inside the training folds
        means = tr.groupby(col).apply(lambda d: y_tr[d.index].mean())
        counts = tr[col].value_counts()
        # smoothing: the rarer the category, the closer its encoding sits to the global mean
        enc = (means * counts + global_mean * smooth) / (counts + smooth)
        # map the lookup table onto the validation fold; unseen categories fall back to
        # the global mean (.values avoids index-alignment surprises on assignment)
        te.iloc[va_idx] = X.iloc[va_idx][col].map(enc).fillna(global_mean).values
    # test-set encoding: smoothed means computed on the full training data
    means_full = X.groupby(col).apply(lambda d: y_log[d.index].mean())
    counts_full = X[col].value_counts()
    enc_full = (means_full * counts_full + global_mean * smooth) / (counts_full + smooth)
    te_test = X_test[col].map(enc_full).fillna(global_mean)
    # append the new column f"{col}_te"
    X[f"{col}_te"] = te.values
    X_test[f"{col}_te"] = te_test.values
    return X, X_test
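# A tiny worked example of the smoothing formula (toy numbers, purely illustrative):
# a category seen only 3 times is pulled strongly toward the global mean
_toy_count, _toy_mean, _toy_global, _toy_smooth = 3, 12.0, 11.5, 20
_toy_enc = (_toy_mean * _toy_count + _toy_global * _toy_smooth) / (_toy_count + _toy_smooth)
print(f"[TE demo] encoding = {_toy_enc:.3f} (raw category mean 12.0, global mean 11.5)")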
# ---------- 3) Apply feature engineering + cleaning to train/test ----------
# Method chaining via DataFrame.pipe strings the steps together and keeps the code tidy
train_fe = train.pipe(impute_lotfrontage_by_neighborhood) \
.pipe(fix_garage_year) \
.pipe(add_strong_features) \
.pipe(map_ordered_qualities)
test_fe = test.pipe(impute_lotfrontage_by_neighborhood) \
.pipe(fix_garage_year) \
.pipe(add_strong_features) \
.pipe(map_ordered_qualities)
# Pull the target y out of the engineered training table; prepare training features X and test features X_test
y = train_fe["SalePrice"].values
X = train_fe.drop(columns=["SalePrice", "Id"])  # training features without the target and Id
X_test = test_fe.drop(columns=["Id"])  # test features without Id
# Pick the "high-value" categorical columns for OOF target encoding (high cardinality / strong signal)
# Adjust per experiment; note they must exist in X.columns
high_card_cols = [c for c in ["Neighborhood", "Exterior1st", "Exterior2nd", "MasVnrType", "HouseStyle"]
if c in X.columns]
# Target-encode each selected categorical column (adds a *_te column per input column)
for c in high_card_cols:
X, X_test = oof_target_encode(X, y, X_test, c, n_splits=5, smooth=20)
# Split columns by dtype into numeric / non-numeric (One-Hot only touches the non-numeric ones)
# Note: the ordinal quality columns mapped in 2.3 are numeric now, so they land in num_cols
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()
# ---------- 0) Validation calibration: stratified K-fold (binned on log1p(y)) ----------
# Discretize the continuous target into quantile bins, then use StratifiedKFold so every
# fold has a similar distribution -> the local CV tracks the Kaggle public LB more closely
from sklearn.model_selection import StratifiedKFold
y_log = np.log1p(y)
# q=10 cuts into 10 bins; duplicates="drop" avoids errors from repeated bin edges
y_bins = pd.qcut(y_log, q=10, labels=False, duplicates="drop")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stratified_splits = list(skf.split(X, y_bins))
print(f"[StratifiedKFold] folds: {len(stratified_splits)} 已建立。")
# ---------- 4) Preprocessing pipelines (imputation + scaling + One-Hot, adapting to old/new sklearn) ----------
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Numeric pipeline: median imputation + standardization (steadier convergence for linear/L1/L2 models)
numeric_preprocess = Pipeline(steps=[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler())
])
# One-Hot version shim (>= 1.2 uses sparse_output; older versions use sparse)
from sklearn import __version__ as skl_version
try:
from packaging import version
NEW_OHE = version.parse(skl_version) >= version.parse("1.2")
except Exception:
NEW_OHE = False
if NEW_OHE:
    onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # new parameter name (>= 1.2)
else:
    # Older versions take sparse=... instead of sparse_output=...
    onehot = OneHotEncoder(handle_unknown="ignore", sparse=False)
# Categorical pipeline: mode imputation + One-Hot; handle_unknown="ignore" prevents errors when the test set contains unseen categories
categorical_preprocess = Pipeline(steps=[
("imputer", SimpleImputer(strategy="most_frequent")),
("onehot", onehot)
])
# Run the numeric and categorical preprocessing branches in parallel
preprocess = ColumnTransformer(
transformers=[
("num", numeric_preprocess, num_cols),
("cat", categorical_preprocess, cat_cols),
],
remainder="drop" # 丢弃未在 num/cat 指定的列(正常情况下不应有剩余)
)
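# Optional sanity check (illustrative): fit the preprocessor once to see the final
# matrix width after imputation, scaling and One-Hot expansion (the pipelines below
# refit it anyway, so this throwaway fit is harmless)
_Xt = preprocess.fit_transform(X)
print(f"[preprocess demo] transformed feature matrix shape: {_Xt.shape}")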
# ---------- 5) Cross-validated evaluation (using the stratified folds) ----------
from sklearn.metrics import mean_squared_error
from sklearn.compose import TransformedTargetRegressor
def rmse_on_log_space(y_true, y_pred):
    """
    RMSE computed in log1p space (the same idea as the Kaggle metric);
    also compatible with older sklearn where mean_squared_error lacks `squared`.
    """
    try:
        return mean_squared_error(np.log1p(y_true), np.log1p(y_pred), squared=False)
    except TypeError:
        return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))
def rmse_cv(model):
"""
给定基础模型,包装成:预处理流水线 + 目标对数变换(TTR);
使用“第0步”生成的 stratified_splits 做 OOF 交叉验证,返回均值与标准差。
"""
pipe = Pipeline(steps=[
("prep", preprocess),
# TransformedTargetRegressor:在 log 空间训练,预测自动做逆变换(expm1)
("reg", TransformedTargetRegressor(regressor=model, func=np.log1p, inverse_func=np.expm1))
])
rmses = []
for tr_idx, va_idx in stratified_splits:
X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
y_tr, y_va = y[tr_idx], y[va_idx]
pipe.fit(X_tr, y_tr)
pred_va = pipe.predict(X_va)
rmses.append(rmse_on_log_space(y_va, pred_va))
return float(np.mean(rmses)), float(np.std(rmses))
# Candidate models: linear (Ridge/Lasso/ElasticNet) and tree-based (HistGBR)
# The linear models get convergence-friendly settings (max_iter/tol/selection) to avoid ConvergenceWarning
from sklearn.linear_model import RidgeCV, LassoCV, ElasticNetCV
from sklearn.ensemble import HistGradientBoostingRegressor
candidates = {
"RidgeCV": RidgeCV(alphas=np.logspace(-3, 3, 25)),
"LassoCV": LassoCV(
alphas=np.logspace(-2, 1, 20), # 抬高下限,避免过小 alpha 导致不收敛
max_iter=200_000, tol=1e-3,
random_state=42, selection="random" # 随机坐标下降,收敛更稳
),
"ElasticNetCV": ElasticNetCV(
l1_ratio=[.1, .3, .5, .7, .9, .95, .99, 1.0],
alphas=np.logspace(-2, 1, 20),
max_iter=200_000, tol=1e-3,
random_state=42, selection="random"
),
"HistGBR": HistGradientBoostingRegressor(
learning_rate=0.06,
max_depth=None,
max_leaf_nodes=31,
min_samples_leaf=20,
l2_regularization=1e-3,
random_state=42
)
}
print("=== CV 对比(RMSE on log1p)===")
results = {}
for name, model in candidates.items():
m, s = rmse_cv(model)
results[name] = (m, s)
print(f"{name:12s}: {m:.5f} +/- {s:.5f}")
# Pick the model with the lowest mean CV RMSE
best_name = min(results, key=lambda k: results[k][0])
print(f"\nBest (by CV): {best_name} -> {results[best_name]}")
# ---------- 5.1) Optional: a small grid search for HistGBR ----------
# Only refine with a small GridSearch if HistGBR won step 5; you can of course force this to True
DO_HGB_TUNING = (best_name == "HistGBR")
best_estimator = None
if DO_HGB_TUNING:
from sklearn.model_selection import GridSearchCV
    # Evaluate RMSE in the raw SalePrice space (easier to compare intuitively)
def rmse_raw(y_true, y_pred):
return np.sqrt(mean_squared_error(y_true, y_pred))
from sklearn.metrics import make_scorer
neg_rmse = make_scorer(rmse_raw, greater_is_better=False)
    # Base pipeline: preprocessing + target transform + HGB
hgb_pipe = Pipeline([
("prep", preprocess),
("reg", TransformedTargetRegressor(
regressor=HistGradientBoostingRegressor(random_state=42),
func=np.log1p, inverse_func=np.expm1))
])
    # A light grid: learning rate, leaf count, min samples per leaf, L2 regularization
param_grid = {
"reg__regressor__learning_rate": [0.03, 0.06, 0.1],
"reg__regressor__max_leaf_nodes": [31, 63, 127],
"reg__regressor__min_samples_leaf": [10, 20, 40],
"reg__regressor__l2_regularization": [1e-3, 3e-3, 1e-2]
}
    # Reuse exactly the same folds as step 0 for comparability (a custom CV iterable)
class PredefinedCV:
def __iter__(self):
for tr, va in stratified_splits:
yield tr, va
def split(self, *args, **kwargs):
return self.__iter__()
def get_n_splits(self, *args, **kwargs):
return len(stratified_splits)
search = GridSearchCV(
estimator=hgb_pipe,
param_grid=param_grid,
scoring=neg_rmse,
cv=PredefinedCV(),
n_jobs=-1
)
search.fit(X, y)
print("\n[HGB Tuning] Best params:", search.best_params_)
print("[HGB Tuning] Best CV RMSE (raw SalePrice):", -search.best_score_)
best_estimator = search.best_estimator_
# ---------- 6) Fit on all data, predict the test set, export the submission ----------
if best_estimator is None:
    # No 5.1 tuning: assemble the final pipeline from the best base model of step 5
    # (preprocessing + target transform + model)
    best_model = candidates[best_name]
    final_pipe = Pipeline(steps=[
        ("prep", preprocess),
        ("reg", TransformedTargetRegressor(regressor=best_model, func=np.log1p, inverse_func=np.expm1))
    ])
else:
    # 5.1 ran: reuse the best estimator (it already contains preprocessing and target transform)
    final_pipe = best_estimator
# Fit on the full training data
final_pipe.fit(X, y)
# Predict the test set (TTR already inverts the transform, so this is SalePrice in the original space)
test_pred = final_pipe.predict(X_test)
# To be safe: prices should never be negative, so clip at zero
test_pred = np.clip(test_pred, 0, None)
# Build the submission file (columns as Kaggle requires: Id, SalePrice)
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": test_pred})
SAVE_PATH = "submission.csv" if best_estimator is None else "submission_tuned.csv"
submission.to_csv(SAVE_PATH, index=False)
print(f"\nSaved => {os.path.abspath(SAVE_PATH)}")
# Tip: compare the local CV mean (results[best_name][0]) against the Kaggle Public LB score:
# a gap < 0.02 (smaller is better) means the validation strategy is well aligned, and you
# can move on to stacking / further feature exploration
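# A minimal sketch of that comparison (the LB score has to be typed in by hand after
# submitting; 0.13588 is what this submission actually scored):
lb_public = 0.13588
cv_best = results[best_name][0]
print(f"[CV vs LB] local {cv_best:.5f} vs public {lb_public:.5f} -> gap {abs(cv_best - lb_public):.5f}")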
After submitting the predictions from the program above, the score came to 0.13588, a qualitative leap over the previous version, though the complexity has grown considerably. In what follows I'll walk through it section by section.
