pandas.DataFrame.corr & scatter_matrix:计算各个属性之间的相关系数

本文通过使用Pandas库进行数据探索,分析了房价数据集中的相关性,并利用matplotlib和scatter_matrix函数创建了属性之间的散点图矩阵,揭示了中位数房价与收入、房间数量等关键因素的关系。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

官方文档

参考博客

# Pairwise correlation between the columns of `housing` (pandas' default
# method is Pearson).
# numeric_only=True restricts the computation to numeric columns. Since
# pandas 2.0 the default is False, so corr() raises if the frame contains a
# column that cannot be converted to float.
# NOTE(review): assumes `housing` is a DataFrame defined earlier that may hold
# non-numeric columns - confirm. The `numeric_only` keyword needs pandas >= 1.5.
corr_matrix = housing.corr(numeric_only=True)
print(corr_matrix)

# Correlation of every attribute with the median house value, sorted from the
# largest coefficient to the smallest.
print(corr_matrix["median_house_value"].sort_values(ascending=False))

pandas.plotting.scatter_matrix官方文档

# scatter_matrix is exposed from pandas.plotting; the old
# pandas.tools.plotting module was deprecated in pandas 0.20 and later removed.
from pandas.plotting import scatter_matrix

# Columns to plot against each other in the scatter-plot matrix.
attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]

# Draw one scatter plot per attribute pair. The original call was missing its
# closing parenthesis, which made the snippet a syntax error.
scatter_matrix(housing[attributes], figsize=(12, 8))

import numpy as np import statsmodels.tsa.stattools as sts import matplotlib.pyplot as plt import pandas as pd import seaborn as sns import statsmodels.api as sm # X = np.random.randn(1000) # Y = np.random.randn(1000) # plt.scatter(X,Y) # plt.show() data = pd.DataFrame(pd.read_excel(r'C:\Users\ivanss\Desktop\groud.xlsx')) # X = np.array(data[['Water heat']]) # Y = np.array(data[['pH']]) import numpy as np from scipy.stats import pearsonr #输入数组 x = np.array(data[['Water heat']]) y = np.array(data[['pH']]) #从二维数组转变成一维数组 x = x.squeeze() y = y.squeeze() print(x.shape, y.shape) # 检测无效值 # 将NaN和inf替换为可过滤的值 x_clean = x[~np.isnan(x) & ~np.isinf(x)] y_clean = y[~np.isnan(y) & ~np.isinf(y)] # 确保x和y长度一致 min_length = min(len(x_clean), len(y_clean)) x_final = x_clean[:min_length] y_final = y_clean[:min_length] # print(x_final) # 添加极其小的偏移量,避免除0 x = np.log(x + 1e-10) y = np.log(y + 1e-10) #输出pearsonr相关系数 from scipy.stats import pearsonr corr, p_value = pearsonr(x_final, y_final) print("水温和PH相关系数:", corr) x = np.array(data[['Water heat']]) y = np.array(data[['DO']]) #从二维数组转变成一维数组 x = x.squeeze() y = y.squeeze() print(x.shape, y.shape) # 检测无效值 # 将NaN和inf替换为可过滤的值 x_clean = x[~np.isnan(x) & ~np.isinf(x)] y_clean = y[~np.isnan(y) & ~np.isinf(y)] # 确保x和y长度一致 min_length = min(len(x_clean), len(y_clean)) x_final = x_clean[:min_length] y_final = y_clean[:min_length] # print(x_final) # 添加极其小的偏移量,避免除0 x = np.log(x + 1e-10) y = np.log(y + 1e-10) #输出pearsonr相关系数 from scipy.stats import pearsonr corr, p_value = pearsonr(x_final, y_final) print("水温和DO相关系数:", corr) 我想将每一列的相关系列都测出来 汇聚成heatmap 同时缺失值和异常值选择用填充而不是删除 该怎么做
03-08
import numpy as np import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import f_classif from statsmodels.stats.outliers_influence import variance_inflation_factor # 1. 计算特征相关性矩阵 corr_matrix = X.corr() plt.figure(figsize=(15, 12)) sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", annot_kws={"size": 8}, cbar_kws={"shrink": 0.8}) plt.title("特征相关性热力图") plt.show() # 2. 计算VIF(方差膨胀因子)检测多重共线性 vif_data = pd.DataFrame() vif_data["Feature"] = feature_columns vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(feature_columns))] print("多重共线性诊断 (VIF > 5 表示高共线性):") print(vif_data.sort_values("VIF", ascending=False)) # 3. 创建统一的重要性对比表 # 计算F检验重要性(单变量) f_scores, _ = f_classif(X, y) f_importance = f_scores / f_scores.max() # 标准化 # 计算模型系数重要性(多变量) coef_importance = np.abs(model.coef_[0]) coef_importance = coef_importance / coef_importance.max() # 标准化 # 创建对比DataFrame importance_df = pd.DataFrame({ "Feature": feature_columns, "F_Importance": f_importance, "Coef_Importance": np.nan # 初始化为NaN }) # 仅填充被选择的特征 selected_indices = selector.get_support(indices=True) for idx in selected_indices: importance_df.at[idx, "Coef_Importance"] = coef_importance[list(selected_indices).index(idx)] # 添加差异指标 importance_df["Importance_Diff"] = np.abs( importance_df["F_Importance"] - importance_df["Coef_Importance"] ) print("\n统一特征重要性对比:") print(importance_df.sort_values("Importance_Diff", ascending=False)) # 4. 
可视化对比 plt.figure(figsize=(14, 8)) plt.scatter(importance_df["F_Importance"], importance_df["Coef_Importance"], s=100) # 添加标签和参考线 for i, row in importance_df.iterrows(): if not np.isnan(row["Coef_Importance"]): plt.text(row["F_Importance"] + 0.02, row["Coef_Importance"] + 0.02, row["Feature"], fontsize=9) else: plt.text(row["F_Importance"] + 0.02, 0.02, f"{row['Feature']} (未选择)", fontsize=9, color="red") plt.axline((0, 0), slope=1, color="red", linestyle="--", alpha=0.5) plt.xlabel("单变量重要性 (F检验)") plt.ylabel("多变量重要性 (模型系数绝对值)") plt.title("单变量与多变量特征重要性对比") plt.grid(True, alpha=0.3) plt.show() # 5. 基于领域知识调整模型 # 示例:强制包含临床重要特征 clinical_features = ["年龄", "GCS", "意识情况"] # 临床重要特征 # 创建新的特征选择器 from sklearn.base import BaseEstimator, TransformerMixin class ClinicalFeatureSelector(BaseEstimator, TransformerMixin): def __init__(self, clinical_features, k=10): self.clinical_features = clinical_features self.k = k self.selector = None def fit(self, X, y=None): # 首先确保包含临床重要特征 clinical_indices = [list(X.columns).index(f) for f in self.clinical_features if f in X.columns] # 使用SelectKBest选择其他特征 self.selector = SelectKBest(f_classif, k=self.k - len(clinical_indices)) other_features = [f for f in X.columns if f not in self.clinical_features] self.selector.fit(X[other_features], y) return self def transform(self, X): clinical_data = X[self.clinical_features].values other_data = self.selector.transform(X[[f for f in X.columns if f not in self.clinical_features]]) return np.hstack([clinical_data, other_data]) def get_support(self): clinical_mask = [True if f in self.clinical_features else False for f in feature_columns] other_mask = self.selector.get_support() return np.array(clinical_mask + list(other_mask)) # 使用新的特征选择器 clinical_selector = ClinicalFeatureSelector(clinical_features=clinical_features, k=10) X_clinical = clinical_selector.fit_transform(X, y) # 重新训练模型 model_clinical = LogisticRegression(max_iter=1000, random_state=42) model_clinical.fit(X_clinical, y_res) # 比较特征选择结果 
print("\n原始特征选择 vs 临床调整特征选择:") print("原始选择:", [feature_columns[i] for i in selector.get_support(indices=True)]) print("临床调整:", [feature_columns[i] for i in clinical_selector.get_support(indices=True)]) 显示好多未引用,改
07-23
用老师的口吻逐行分析以下线性回归的代码,我要把他讲给我的学生。# -*- coding: utf-8 -*- from __future__ import print_function import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn import linear_model from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import mean_squared_error, r2_score import warnings from typing import Tuple, Any from matplotlib.font_manager import FontProperties # 设置中文字体 font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=12) warnings.filterwarnings('ignore') # 设置matplotlib样式 plt.style.use('default') # 使用默认样式,而不是seaborn样式 class HousePricePredictor: def __init__(self): self.scaler = StandardScaler() self.model = linear_model.LinearRegression() self.X_train = None self.X_test = None self.y_train = None self.y_test = None def load_data(self, filename: str) -> Tuple[np.ndarray, np.ndarray]: """加载并预处理数据""" try: data = np.loadtxt(filename, delimiter=",", dtype=np.float64) X = data[:, 0:-1] y = data[:, -1] return X, y except Exception as e: print(f"加载数据时出错: {str(e)}") raise def prepare_data(self, X: np.ndarray, y: np.ndarray, test_size: float = 0.2): """数据准备和划分""" self.X_train, self.X_test, self.y_train, self.y_test = train_test_split( X, y, test_size=test_size, random_state=42 ) self.X_train = self.scaler.fit_transform(self.X_train) self.X_test = self.scaler.transform(self.X_test) def train_model(self): """训练模型""" self.model.fit(self.X_train, self.y_train) def evaluate_model(self): """模型评估""" y_pred = self.model.predict(self.X_test) mse = mean_squared_error(self.y_test, y_pred) rmse = np.sqrt(mse) r2 = r2_score(self.y_test, y_pred) print("\n模型评估结果:") print(f"均方误差 (MSE): {mse:.2f}") print(f"均方根误差 (RMSE): {rmse:.2f}") print(f"决定系数 (R²): {r2:.4f}") cv_scores = cross_val_score(self.model, self.X_train, self.y_train, cv=5, scoring='r2') print(f"\n5折交叉验证 R² 分数: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})") def 
visualize_data(self, X: np.ndarray, y: np.ndarray): """数据可视化""" try: df = pd.DataFrame(X, columns=['面积', '卧室数']) df['价格'] = y # 相关性热力图 plt.figure(figsize=(8, 6)) correlation_matrix = df.corr() sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0) plt.title('特征相关性热力图', fontproperties=font) plt.tight_layout() plt.show() # 面积与价格的散点图 plt.figure(figsize=(10, 6)) plt.scatter(df['面积'], df['价格'], alpha=0.5) plt.xlabel('面积', fontproperties=font) plt.ylabel('价格', fontproperties=font) plt.title('面积与价格关系图', fontproperties=font) plt.tight_layout() plt.show() # 卧室数与价格的箱线图 plt.figure(figsize=(10, 6)) sns.boxplot(x='卧室数', y='价格', data=df) plt.xlabel('卧室数', fontproperties=font) plt.ylabel('价格', fontproperties=font) plt.title('卧室数与价格分布图', fontproperties=font) plt.tight_layout() plt.show() # 价格分布图 plt.figure(figsize=(10, 6)) sns.histplot(df['价格'], kde=True) plt.title('房价分布图', fontproperties=font) plt.xlabel('价格', fontproperties=font) plt.ylabel('频数', fontproperties=font) plt.tight_layout() plt.show() except Exception as e: print(f"可视化过程中出错: {str(e)}") def visualize_predictions(self): """预测结果可视化""" try: y_pred = self.model.predict(self.X_test) plt.figure(figsize=(10, 6)) plt.scatter(self.y_test, y_pred, c='blue', alpha=0.5) plt.plot([self.y_test.min(), self.y_test.max()], [self.y_test.min(), self.y_test.max()], 'r--', lw=2) plt.xlabel('实际价格', fontproperties=font) plt.ylabel('预测价格', fontproperties=font) plt.title('预测价格 vs 实际价格', fontproperties=font) plt.tight_layout() plt.show() except Exception as e: print(f"预测可视化过程中出错: {str(e)}") def predict_new(self, features: np.ndarray) -> float: """预测新数据""" features_scaled = self.scaler.transform(features.reshape(1, -1)) return self.model.predict(features_scaled)[0] def print_model_params(self): """打印模型参数""" print("\n模型参数:") print(f"截距: {self.model.intercept_:.2f}") print("特征系数:") for i, coef in enumerate(['面积系数', '卧室数系数']): print(f"{coef}: {self.model.coef_[i]:.2f}") def main(): predictor = HousePricePredictor() try: 
print("正在加载数据...\n") X, y = predictor.load_data("data.txt") print("生成数据可视化...\n") predictor.visualize_data(X, y) predictor.prepare_data(X, y) print("训练模型...\n") predictor.train_model() predictor.print_model_params() predictor.evaluate_model() print("\n生成预测结果可视化...\n") predictor.visualize_predictions() test_features = np.array([1688, 3]) predicted_price = predictor.predict_new(test_features) print(f"\n预测房价 (面积=1688, 卧室数=3): ${predicted_price:,.2f}") except Exception as e: print(f"发生错误: {str(e)}") if __name__ == "__main__": main()
最新发布
08-09
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值