# -*- coding: utf-8 -*-
# 用老师的口吻逐行分析以下线性回归的代码，我要把它讲给我的学生。
from __future__ import print_function

import os
import warnings
from typing import Tuple, Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.font_manager import FontProperties
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
# Chinese-capable font passed as `fontproperties` to the plot titles and axis
# labels below. The SimSun collection at its usual Windows location is used
# when that file exists; otherwise (Linux, macOS, non-default Windows install)
# fall back to a list of commonly installed CJK font families instead of
# pointing matplotlib at a file that is not there.
_SIMSUN_PATH = r"c:\windows\fonts\simsun.ttc"
if os.path.isfile(_SIMSUN_PATH):
    font = FontProperties(fname=_SIMSUN_PATH, size=12)
else:
    font = FontProperties(
        family=[
            "SimSun",
            "SimHei",
            "Microsoft YaHei",
            "PingFang SC",
            "Noto Sans CJK SC",
            "WenQuanYi Micro Hei",
            "sans-serif",
        ],
        size=12,
    )
# NOTE: this silences every warning for the whole process, not only the ones
# raised by the plotting and modelling libraries.
warnings.filterwarnings('ignore')
# Use matplotlib's built-in default style rather than a seaborn style.
plt.style.use('default')
class HousePricePredictor:
    """Small linear-regression pipeline for a house-price data set.

    The object owns a ``StandardScaler`` and a ``LinearRegression`` model and
    exposes one method per step: load a CSV-style file, split and scale it,
    fit the model, print metrics, draw plots, and predict a single new sample.

    The plotting and default coefficient labels are written for a data set
    with two feature columns (area, number of bedrooms) followed by the price.
    """

    def __init__(self):
        """Create an unfitted scaler and model; the data splits start empty."""
        self.scaler = StandardScaler()
        self.model = linear_model.LinearRegression()
        # Populated by prepare_data().
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self, filename: str) -> Tuple[np.ndarray, np.ndarray]:
        """Load a comma-separated numeric file and split it into X and y.

        Args:
            filename: Path of the text file to read with ``np.loadtxt``.

        Returns:
            ``(X, y)`` where ``X`` holds every column except the last and
            ``y`` holds the last column.

        Raises:
            Exception: Whatever loading or slicing raises is reported on
                stdout and then re-raised unchanged.
        """
        try:
            # ndmin=2 keeps a one-row file two-dimensional; without it
            # np.loadtxt returns a 1-D array and the column slicing below
            # fails with IndexError.
            data = np.loadtxt(filename, delimiter=",", dtype=np.float64, ndmin=2)
            X = data[:, 0:-1]
            y = data[:, -1]
            return X, y
        except Exception as e:
            print(f"加载数据时出错: {str(e)}")
            raise

    def prepare_data(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2):
        """Split the data into train/test parts and standardize the features.

        Args:
            X: Feature matrix.
            y: Target values aligned with the rows of ``X``.
            test_size: Fraction handed to ``train_test_split`` as the test
                share (default 0.2).

        The split uses ``random_state=42`` so it is reproducible. The scaler
        is fitted on the training features only and then applied to the test
        features, so no test-set statistics enter the scaling.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)

    def train_model(self):
        """Fit the linear model on the scaled training split."""
        self.model.fit(self.X_train, self.y_train)

    def evaluate_model(self):
        """Print test-set MSE, RMSE and R², then 5-fold cross-validated R².

        The cross-validation runs on the (already scaled) training split and
        reports the mean score with twice the standard deviation as spread.
        """
        y_pred = self.model.predict(self.X_test)
        mse = mean_squared_error(self.y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(self.y_test, y_pred)
        print("\n模型评估结果:")
        print(f"均方误差 (MSE): {mse:.2f}")
        print(f"均方根误差 (RMSE): {rmse:.2f}")
        print(f"决定系数 (R²): {r2:.4f}")
        cv_scores = cross_val_score(self.model, self.X_train, self.y_train,
                                    cv=5, scoring='r2')
        print(f"\n5折交叉验证 R² 分数: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    def visualize_data(self, X: np.ndarray, y: np.ndarray):
        """Show four exploratory plots of the raw data.

        Args:
            X: Feature matrix; its columns are labelled '面积' (area) and
                '卧室数' (bedrooms), so exactly two columns are expected.
            y: Prices, stored in the '价格' column.

        Plots, each in its own figure: correlation heat map, area-vs-price
        scatter, price box plot per bedroom count, and price histogram with a
        KDE curve. Plotting is best-effort: any exception is reported on
        stdout and not re-raised.
        """
        try:
            df = pd.DataFrame(X, columns=['面积', '卧室数'])
            df['价格'] = y

            # Correlation heat map.
            plt.figure(figsize=(8, 6))
            correlation_matrix = df.corr()
            sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
            plt.title('特征相关性热力图', fontproperties=font)
            plt.tight_layout()
            plt.show()

            # Scatter plot of area against price.
            plt.figure(figsize=(10, 6))
            plt.scatter(df['面积'], df['价格'], alpha=0.5)
            plt.xlabel('面积', fontproperties=font)
            plt.ylabel('价格', fontproperties=font)
            plt.title('面积与价格关系图', fontproperties=font)
            plt.tight_layout()
            plt.show()

            # Box plot of price grouped by bedroom count.
            plt.figure(figsize=(10, 6))
            sns.boxplot(x='卧室数', y='价格', data=df)
            plt.xlabel('卧室数', fontproperties=font)
            plt.ylabel('价格', fontproperties=font)
            plt.title('卧室数与价格分布图', fontproperties=font)
            plt.tight_layout()
            plt.show()

            # Histogram of prices with a KDE overlay.
            plt.figure(figsize=(10, 6))
            sns.histplot(df['价格'], kde=True)
            plt.title('房价分布图', fontproperties=font)
            plt.xlabel('价格', fontproperties=font)
            plt.ylabel('频数', fontproperties=font)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"可视化过程中出错: {str(e)}")

    def visualize_predictions(self):
        """Scatter predicted against actual test prices.

        A red dashed diagonal spanning the range of the actual prices marks
        where prediction equals truth. Plotting is best-effort: any exception
        is reported on stdout and not re-raised.
        """
        try:
            y_pred = self.model.predict(self.X_test)
            plt.figure(figsize=(10, 6))
            plt.scatter(self.y_test, y_pred, c='blue', alpha=0.5)
            plt.plot([self.y_test.min(), self.y_test.max()],
                     [self.y_test.min(), self.y_test.max()],
                     'r--', lw=2)
            plt.xlabel('实际价格', fontproperties=font)
            plt.ylabel('预测价格', fontproperties=font)
            plt.title('预测价格 vs 实际价格', fontproperties=font)
            plt.tight_layout()
            plt.show()
        except Exception as e:
            print(f"预测可视化过程中出错: {str(e)}")

    def predict_new(self, features: np.ndarray) -> float:
        """Predict the price of one new sample.

        Args:
            features: The raw (unscaled) feature values of a single sample.
                Any one-dimensional array-like is accepted, e.g. a NumPy
                array, list or tuple.

        Returns:
            The predicted price as a built-in ``float``.
        """
        # np.asarray lets plain lists/tuples through; they have no .reshape.
        row = np.asarray(features, dtype=np.float64).reshape(1, -1)
        features_scaled = self.scaler.transform(row)
        return float(self.model.predict(features_scaled)[0])

    def print_model_params(self):
        """Print the fitted intercept and every feature coefficient.

        The first two coefficients keep their area/bedroom labels; any further
        coefficient gets a numbered generic label so nothing is silently
        dropped, and a model with fewer than two features no longer fails.
        """
        print("\n模型参数:")
        print(f"截距: {self.model.intercept_:.2f}")
        print("特征系数:")
        known_labels = ['面积系数', '卧室数系数']
        for i, coef in enumerate(np.ravel(self.model.coef_)):
            label = known_labels[i] if i < len(known_labels) else f"特征{i + 1}系数"
            print(f"{label}: {coef:.2f}")
def main(filename: str = "data.txt"):
    """Run the whole demo: load, explore, train, evaluate, plot, predict.

    Args:
        filename: Path of the comma-separated data file handed to
            ``HousePricePredictor.load_data``. Defaults to ``"data.txt"``,
            the path this script has always read.

    Any exception raised during the pipeline is caught and reported as a
    single line on stdout; nothing is re-raised.
    """
    predictor = HousePricePredictor()
    # Sample house for the final demonstration prediction. The values are
    # named once so the feature array and the printed message cannot drift
    # apart.
    area, bedrooms = 1688, 3
    try:
        print("正在加载数据...\n")
        X, y = predictor.load_data(filename)

        print("生成数据可视化...\n")
        predictor.visualize_data(X, y)

        predictor.prepare_data(X, y)

        print("训练模型...\n")
        predictor.train_model()
        predictor.print_model_params()
        predictor.evaluate_model()

        print("\n生成预测结果可视化...\n")
        predictor.visualize_predictions()

        test_features = np.array([area, bedrooms])
        predicted_price = predictor.predict_new(test_features)
        print(f"\n预测房价 (面积={area}, 卧室数={bedrooms}): ${predicted_price:,.2f}")
    except Exception as e:
        print(f"发生错误: {str(e)}")
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# 最新发布