波士顿房价预测实战

啊啊啊阿西吧

已于 2025-04-26 20:03:04 修改

阅读量265

点赞数 9

CC 4.0 BY-SA版权

文章标签： python 机器学习

于 2025-04-26 00:27:10 首次发布

本文链接：https://blog.youkuaiyun.com/2301_81248263/article/details/147523786

波士顿房价预测完整实战教程（附Python代码）

温馨提示：波士顿数据集存在伦理争议，本教程仅用于技术演示，建议在实际项目中使用更合规的数据集

人工智能
机器学习
Python
Kaggle
数据分析
🔥 项目背景

为什么选择房价预测作为第一个机器学习项目？

数据特征直观（面积、房间数等）
适合练习回归任务评估指标（MAE/MSE）
Kaggle经典入门竞赛项目

环境准备

# 所需库安装（若未安装）
!pip install pandas numpy matplotlib seaborn scikit-learn

# 基础库导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make Chinese text render in figures. Matplotlib walks this list in order
# and uses the first family that is installed, so several common CJK fonts
# are listed instead of relying on SimHei alone (which ships with Windows).
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # Windows
    'Microsoft YaHei',      # Windows
    'PingFang SC',          # macOS
    'Heiti TC',             # macOS
    'Arial Unicode MS',     # macOS / Office installs
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
]
# Draw the minus sign as ASCII '-' so it is not lost with CJK fonts.
plt.rcParams['axes.unicode_minus'] = False

数据加载与探索

from sklearn.datasets import fetch_openml

# Fetch the Boston housing data from OpenML as pandas objects.
# NOTE: this dataset has known ethical concerns; it is used for demonstration only.
boston = fetch_openml(name="boston", version=1, as_frame=True)
# With as_frame=True, `boston.data` is already a DataFrame, so copy it rather
# than re-wrapping it; the copy keeps the fetched bunch untouched.
df = boston.data.copy()
df['MEDV'] = boston.target  # append the target column

print("数据集维度:", df.shape)
print("\n前3行示例:")
# print() works in notebooks and plain scripts alike; display() exists only
# under IPython and raises NameError elsewhere.
print(df.head(3))

# Feature glossary (abridged):
#   CRIM:  crime rate                       RM:      number of rooms
#   LSTAT: share of lower-income residents  DIS:     distance to employment centres
#   PTRATIO: pupil-teacher ratio            MEDV:    median home value (target)

数据可视化分析

缺失值检查

# Plot the boolean null-mask of the frame as a heatmap to spot missing values.
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title("缺失值分布热力图")
fig.savefig('missing.png', dpi=300, bbox_inches='tight')
plt.show()

在这里插入图片描述

特征分布直方图

# One 30-bin histogram per column; pandas creates the figure, which then
# becomes the current figure and receives the overall title.
df.hist(bins=30, figsize=(15, 12))
hist_fig = plt.gcf()
hist_fig.suptitle("特征分布直方图", y=1.02)
hist_fig.savefig('hist.png', dpi=300, bbox_inches='tight')
plt.show()

在这里插入图片描述

关键特征与房价

# Scatter each of four selected features against the target on a 2x2 grid.
features = ['LSTAT', 'RM', 'PTRATIO', 'CRIM']
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
for panel, column in zip(axs.ravel(), features):
    sns.scatterplot(x=df[column], y=df['MEDV'], ax=panel)
    panel.set_title(f'{column} vs 房价')
fig.tight_layout()
fig.savefig('scatter.png', dpi=300)
plt.show()

在这里插入图片描述

建模与评估

数据预处理

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the target column.
X = df.drop("MEDV", axis=1)
y = df["MEDV"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing used by every model below: fill missing values with the
# column median, then standardise each feature.
preprocessor = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

模型训练与评估

from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline  # required by train_evaluate below

# Candidate regressors, keyed by the name used in the printed report.
models = {
    "线性回归": LinearRegression(),
    "随机森林": RandomForestRegressor(n_estimators=200, max_depth=8, random_state=42)
}


def train_evaluate(model, X_tr, y_tr, X_val, y_val):
    """Fit preprocessing + ``model`` on one split and return the MAE on another.

    Relies on a module-level ``preprocessor`` transformer being defined
    before this function is called.

    Args:
        model: Unfitted regressor; it is fitted in place as the last
            pipeline step.
        X_tr: Training features.
        y_tr: Training targets.
        X_val: Features to score on.
        y_val: True targets for ``X_val``.

    Returns:
        Mean absolute error of the predictions on ``X_val``.
    """
    # Pipelines do not copy their steps, so clone the shared preprocessor to
    # keep each model's fitted transformer independent of the others.
    pipeline = make_pipeline(clone(preprocessor), model)
    pipeline.fit(X_tr, y_tr)
    preds = pipeline.predict(X_val)
    return mean_absolute_error(y_val, preds)


# Single hold-out evaluation (not cross-validation): fit on the training
# split and score on the test split.
results = {}
for name, model in models.items():
    mae = train_evaluate(model, X_train, y_train, X_test, y_test)
    results[name] = mae
    print(f"{name}测试MAE: {mae:.4f}")

# Report the model with the lowest MAE.
best_model = min(results, key=results.get)
print(f"\n最佳模型：{best_model}，MAE：{results[best_model]:.4f}")

典型输出结果：

线性回归测试MAE: 3.1891
随机森林测试MAE: 2.4836

最佳模型：随机森林，MAE：2.4836

特征重要性分析

# Fit a preprocessing + random-forest pipeline on the training split and read
# the importances from its final step.
rf_model = make_pipeline(preprocessor, models["随机森林"]).fit(X_train, y_train)

forest_step = rf_model.named_steps['randomforestregressor']
importances = forest_step.feature_importances_
feat_import = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# Horizontal bar chart, most important feature first.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feat_import.values, y=feat_import.index, ax=ax)
ax.set_title("特征重要性排序")
ax.set_xlabel("重要性得分")
fig.savefig('importance.png', dpi=300, bbox_inches='tight')
plt.show()

在这里插入图片描述

数据集：房屋价格预测
https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

代码实现（Python）


# 波士顿房价预测实战（兼容scikit-learn 1.2+版本）
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Make Chinese text render in figures. Matplotlib walks this list in order
# and uses the first family that is installed, so several common CJK fonts
# are listed instead of relying on SimHei alone (which ships with Windows).
plt.rcParams['font.sans-serif'] = [
    'SimHei',               # Windows
    'Microsoft YaHei',      # Windows
    'PingFang SC',          # macOS
    'Heiti TC',             # macOS
    'Arial Unicode MS',     # macOS / Office installs
    'Noto Sans CJK SC',     # Linux
    'WenQuanYi Micro Hei',  # Linux
]
# Draw the minus sign as ASCII '-' so it is not lost with CJK fonts.
plt.rcParams['axes.unicode_minus'] = False

# Ethics notice, printed in bold red via ANSI escape codes when the module runs.
print("""\033[1;31m重要声明：
波士顿房价数据集存在伦理问题，包含可能带有偏见的人口统计特征。
建议在实际应用中使用更符合伦理标准的数据集（如加州房价数据集）。
本代码仅供教学演示用途。\033[0m
""")

# Data loading
def load_data():
    """Download the Boston housing data and return it as a single DataFrame.

    Fetches version 1 of the ``boston`` dataset from OpenML, appends the
    target as a ``MEDV`` column, and prints the resulting shape together
    with the first two rows.

    Returns:
        pandas.DataFrame: the feature columns followed by ``MEDV``.
    """
    bunch = fetch_openml(name="boston", version=1, as_frame=True)
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame = features.assign(MEDV=bunch.target)  # target goes in as the last column

    print(f"\n数据集形状：{frame.shape}")
    print("特征示例：")
    print(frame.head(2))
    return frame

# Exploratory data analysis (EDA)
def perform_eda(df):
    """Write four exploratory plots of ``df`` as PNG files in the working directory.

    The plots are: a missing-value heatmap, per-column histograms, a
    correlation heatmap, and scatter plots of four selected features
    against ``MEDV``. Each figure is closed after it is saved.

    Args:
        df: DataFrame containing the feature columns and a ``MEDV`` column.
    """
    def _export(filename):
        # Save the current figure with a tight bounding box, then close it.
        plt.savefig(filename, bbox_inches='tight')
        plt.close()

    # 1) Missing values: heatmap of the boolean null-mask.
    plt.figure(figsize=(10, 6))
    sns.heatmap(df.isnull(), cbar=False)
    plt.title("缺失值分布热力图")
    _export('boston_missing_values.png')

    # 2) Distribution of every column, 30 bins each.
    df.hist(bins=30, figsize=(15, 12))
    plt.suptitle("特征分布直方图", y=1.02)
    _export('boston_feature_distribution.png')

    # 3) Pairwise correlations; the mask hides the upper triangle
    #    (diagonal included) so each pair is shown once.
    plt.figure(figsize=(12, 10))
    correlations = df.corr()
    hide_upper = np.triu(np.ones_like(correlations, dtype=bool))
    sns.heatmap(correlations, annot=True, fmt=".2f", cmap='coolwarm',
                mask=hide_upper)
    plt.title("特征相关性矩阵")
    _export('boston_correlation_matrix.png')

    # 4) Target versus four selected features on a 2x2 grid.
    key_columns = ('LSTAT', 'RM', 'PTRATIO', 'CRIM')
    _, grid = plt.subplots(2, 2, figsize=(15, 12))
    for panel, column in zip(grid.ravel(), key_columns):
        sns.scatterplot(x=df[column], y=df['MEDV'], ax=panel)
        panel.set_title(f'{column} vs MEDV')
    plt.tight_layout()
    _export('boston_key_features.png')

# Modelling workflow
def main():
    """Run the full workflow: load, explore, cross-validate, test, explain.

    Loads the data, writes the EDA plots, holds out 20% of the rows as a
    test set, compares a linear regression and a random forest with 5-fold
    cross-validation on the training rows, refits the model with the lowest
    average MAE on all training rows, and prints its test-set MAE. When the
    winning model's final estimator exposes ``feature_importances_``, a bar
    chart is saved to ``boston_feature_importance.png``.
    """
    df = load_data()
    perform_eda(df)

    # Split features and target.
    X = df.drop("MEDV", axis=1)
    y = df["MEDV"]

    # Hold out 20% for the final test; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    def build_preprocessor():
        # Median imputation followed by standardisation. A fresh instance is
        # built per model because pipelines do not copy their steps, and a
        # shared transformer would be refitted by whichever pipeline fits last.
        # NOTE(review): assumes every feature can be treated as numeric —
        # confirm for any column OpenML delivers as categorical.
        return make_pipeline(
            SimpleImputer(strategy='median'),
            StandardScaler()
        )

    # Candidate models, keyed by the name used in the printed report.
    models = {
        "线性回归": make_pipeline(build_preprocessor(), LinearRegression()),
        "随机森林": make_pipeline(build_preprocessor(), RandomForestRegressor(
            n_estimators=200,
            max_depth=8,
            random_state=42
        ))
    }

    # 5-fold cross-validation on the training rows only.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    results = {}

    for name, model in models.items():
        mae_scores = []
        for train_idx, val_idx in kf.split(X_train):
            # kf.split yields positional indices, hence .iloc.
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_val = X_train.iloc[val_idx]
            y_val = y_train.iloc[val_idx]

            model.fit(X_tr, y_tr)
            preds = model.predict(X_val)
            mae_scores.append(mean_absolute_error(y_val, preds))

        avg_mae = np.mean(mae_scores)
        results[name] = avg_mae
        print(f"[{name}] 平均MAE: {avg_mae:.4f}")

    # Pick the model with the lowest average MAE and refit it on all training rows.
    best_model_name = min(results, key=results.get)
    best_model = models[best_model_name]
    best_model.fit(X_train, y_train)

    # Final evaluation on the held-out test set.
    test_preds = best_model.predict(X_test)
    final_mae = mean_absolute_error(y_test, test_preds)
    print(f"\n\033[1m最佳模型：{best_model_name} | 测试集MAE: {final_mae:.4f}\033[0m")

    # Feature importances, only when the winning estimator provides them.
    # Look at the pipeline's last step instead of a hard-coded step name:
    # indexing 'randomforestregressor' raises KeyError when linear regression wins.
    final_estimator = best_model.steps[-1][1]
    if hasattr(final_estimator, 'feature_importances_'):
        importances = final_estimator.feature_importances_
        features = X.columns
        importance_df = pd.DataFrame({'特征': features, '重要性': importances})
        importance_df = importance_df.sort_values('重要性', ascending=False)

        plt.figure(figsize=(10, 6))
        sns.barplot(x='重要性', y='特征', data=importance_df)
        plt.title("特征重要性排序")
        plt.savefig('boston_feature_importance.png', bbox_inches='tight')
        plt.close()

# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
```

![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/dbb0731b08fb4e73bbbb680eef60e77a.png#pic_center)
## 4. 模型对比结果
![在这里插入图片描述](https://i-blog.csdnimg.cn/direct/f149ebd9e00f4c16ae6a47595115e8a1.png)
## 5. 效果优化方案
### 5.1 特征工程进阶
```python
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression

# Add an interaction feature from columns the Boston data actually has.
# The features described in this tutorial (CRIM, RM, LSTAT, DIS, PTRATIO, ...)
# include no 'Latitude'/'Longitude' columns, so indexing those raises KeyError.
df['RM_x_LSTAT'] = df['RM'] * df['LSTAT']

# Model stacking: a random forest and a linear regression act as base
# learners, and a second random forest combines their predictions.
estimators = [
    ('rf', RandomForestRegressor()),
    ('lr', LinearRegression())
]

stacking_model = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor()
)