import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
# 1. Data loading and initial inspection
train_df = pd.read_csv('./data/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('./data/house-prices-advanced-regression-techniques/test.csv')
# Save the test-set IDs (needed later for the submission file)
test_ids = test_df['Id']
# 2. Preprocessing function
def preprocess_data(df):
    # Fill missing categorical features where NaN means the feature is absent
    cat_na_fill = {
        'Alley': 'NoAlley', 'BsmtQual': 'NoBsmt', 'BsmtCond': 'NoBsmt',
        'BsmtExposure': 'NoBsmt', 'BsmtFinType1': 'NoBsmt', 'BsmtFinType2': 'NoBsmt',
        'FireplaceQu': 'NoFireplace', 'GarageType': 'NoGarage', 'GarageFinish': 'NoGarage',
        'GarageQual': 'NoGarage', 'GarageCond': 'NoGarage', 'PoolQC': 'NoPool',
        'Fence': 'NoFence', 'MiscFeature': 'None', 'MasVnrType': 'None'
    }
    df = df.fillna(cat_na_fill)
    # Fill missing numeric features with the column mean
    num_cols = df.select_dtypes(include=np.number).columns
    num_imputer = SimpleImputer(strategy='mean')
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
    return df
# 3. Apply the preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
# 4. Feature engineering
target = 'SalePrice'
X_train = train_df.drop(columns=['Id', target])
y_train = train_df[target]
# Separate numeric and categorical features
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()
cat_cols = X_train.select_dtypes(exclude=np.number).columns.tolist()
# One-hot encode the categorical features (sparse_output replaces the deprecated sparse argument)
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_cat = ohe.fit_transform(X_train[cat_cols])
# Build the full feature matrix as a DataFrame
X_processed = pd.DataFrame(
    np.hstack([X_train[num_cols].values, X_cat]),
    columns=num_cols + ohe.get_feature_names_out(cat_cols).tolist()
)
# 5. Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_processed)
y_log = np.log1p(y_train)  # log-transform the target variable
# 6. PyTorch model definition: a single linear layer (linear regression)
class HousePriceModel(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.linear = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear(x)
# 7. Training configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
results = []
final_predictions = np.zeros(len(test_df))
# 8. K-fold cross-validation
for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
    print(f"\n=== Fold {fold+1}/{k_folds} ===")
    # Split the data for this fold
    X_train_fold, X_val_fold = X_scaled[train_idx], X_scaled[val_idx]
    y_train_fold, y_val_fold = y_log.iloc[train_idx], y_log.iloc[val_idx]
    # Convert to PyTorch tensors
    train_data = torch.tensor(X_train_fold, dtype=torch.float32).to(device)
    train_labels = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1).to(device)
    val_data = torch.tensor(X_val_fold, dtype=torch.float32).to(device)
    val_labels = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1).to(device)
    # Initialize the model
    model = HousePriceModel(input_size=X_scaled.shape[1]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # Training loop (full-batch gradient descent on the log-transformed target)
    epochs = 500
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(train_data)
        loss = criterion(outputs, train_labels)
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 100 == 0:
            print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")
    # Validation predictions
    model.eval()
    with torch.no_grad():
        val_preds = model(val_data).cpu().numpy().flatten()
    # Compute RMSLE (root mean squared log error)
    rmsle = np.sqrt(mean_squared_log_error(
        np.expm1(y_val_fold),
        np.expm1(val_preds)
    ))
    results.append(rmsle)
    print(f"Validation RMSLE: {rmsle:.5f}")
    # Test-set predictions (averaged across folds)
    X_test = test_df.drop(columns=['Id'])
    X_test_cat = ohe.transform(X_test[cat_cols])
    X_test_final = np.hstack([X_test[num_cols].values, X_test_cat])
    X_test_scaled = scaler.transform(X_test_final)
    test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)
    with torch.no_grad():
        fold_preds = model(test_tensor).cpu().numpy().flatten()
    final_predictions += np.expm1(fold_preds) / k_folds
# 9. Cross-validation results
print("\n=== Cross-validation Results ===")
print(f"Mean RMSLE: {np.mean(results):.5f}")
print(f"Std RMSLE: {np.std(results):.5f}")
# 10. Write the submission file
submission = pd.DataFrame({
    'Id': test_ids,
    'SalePrice': final_predictions
})
submission.to_csv('submission3.csv', index=False)
print("Submission file saved successfully!")
Running the cell fails at the RMSLE computation inside the cross-validation loop with the following error:

ValueError Traceback (most recent call last)
Cell In[66], line 118
115 val_preds = model(val_data).cpu().numpy().flatten()
117 # Compute RMSLE (root mean squared log error)
--> 118 rmsle = np.sqrt(mean_squared_log_error(
119 np.expm1(y_val_fold),
120 np.expm1(val_preds)
121 ))
122 results.append(rmsle)
123 print(f"Validation RMSLE: {rmsle:.5f}")
File ~\.conda\envs\d2l\lib\site-packages\sklearn\utils\_param_validation.py:216, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
210 try:
211 with config_context(
212 skip_parameter_validation=(
213 prefer_skip_nested_validation or global_skip_validation
214 )
215 ):
--> 216 return func(*args, **kwargs)
217 except InvalidParameterError as e:
218 # When the function is just a wrapper around an estimator, we allow
219 # the function to delegate validation to the estimator, but we replace
220 # the name of the estimator by the name of the function in the error
221 # message to avoid confusion.
222 msg = re.sub(
223 r"parameter of \w+ must be",
224 f"parameter of {func.__qualname__} must be",
225 str(e),
226 )
File ~\.conda\envs\d2l\lib\site-packages\sklearn\metrics\_regression.py:746, in mean_squared_log_error(y_true, y_pred, sample_weight, multioutput)
741 _, y_true, y_pred, _, _ = _check_reg_targets_with_floating_dtype(
742 y_true, y_pred, sample_weight, multioutput, xp=xp
743 )
745 if xp.any(y_true <= -1) or xp.any(y_pred <= -1):
--> 746 raise ValueError(
747 "Mean Squared Logarithmic Error cannot be used when "
748 "targets contain values less than or equal to -1."
749 )
751 return mean_squared_error(
752 xp.log1p(y_true),
753 xp.log1p(y_pred),
754 sample_weight=sample_weight,
755 multioutput=multioutput,
756 )
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain values less than or equal to -1.
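
The error comes from back-transforming log-space values before scoring. Since y_log = np.log1p(y_train), the model's outputs are on the log1p scale as well; if any validation prediction is sufficiently negative (for instance when the full-batch linear model has not converged, or when rare one-hot columns take extreme values after standardization), np.expm1 of it underflows to exactly -1.0, and mean_squared_log_error rejects any input <= -1. Below is a minimal sketch of two ways to score a fold without hitting that check; it assumes numpy is imported as np and that y_val_fold and val_preds are the log1p-scale arrays from the loop above.

# Sketch: two ways to compute the fold's RMSLE without tripping the <= -1 check.
from sklearn.metrics import mean_squared_error, mean_squared_log_error

# Option 1: targets and predictions are already log1p-transformed, so the RMSLE
# is simply the RMSE in log space; no expm1/log1p round trip is needed.
rmsle = np.sqrt(mean_squared_error(y_val_fold, val_preds))

# Option 2: score on the price scale, but clip the back-transformed predictions
# at zero so mean_squared_log_error never sees values <= -1.
val_preds_price = np.clip(np.expm1(val_preds), 0, None)
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val_fold), val_preds_price))

Option 1 is mathematically identical to the intended RMSLE (up to floating-point error), while Option 2 changes the metric only on rows whose back-transformed predictions had to be clipped.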