(mu, sigma) = norm.fit(数据列)

本文探讨了线性模型在机器学习中的应用,强调了目标值正态分布的重要性。为了提高模型效果,文章介绍了如何处理数据列,获取其正态分布的均值和标准差。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

线性的模型需要正态分布的目标值才能发挥最大的作用。

所以在处理数据时候,需要对数据列进行处理,获取数据列正太分布的均值和标准差。

import pandas as pd import numpy as np import matplotlib.pyplot as plt from scipy.stats import genextreme, pearson3, norm from sklearn.mixture import GaussianMixture from sklearn.metrics import mean_squared_error import scipy.stats as stats # 设置中文字体支持 plt.rcParams["font.family"] = ["SimSun", "SimHei", "WenQuanYi Micro Hei", "Heiti TC"] plt.rcParams["axes.unicode_minus"] = False def load_temperature_data(file_path, sheet_name, column_name): """加载温度数据并过滤缺失值""" data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl') return data[column_name].dropna().values def gev_fit(data): """广义极值分布拟合""" c, loc, scale = genextreme.fit(data) return c, loc, scale def gaussian_mixture_fit(data, n_components=2): """混合高斯分布拟合""" data = data.reshape(-1, 1) gmm = GaussianMixture(n_components=n_components, random_state=42) gmm.fit(data) return gmm def pearson3_fit(data): """皮尔逊Ⅲ型分布拟合""" skew, loc, scale = pearson3.fit(data) return skew, loc, scale def normal_fit(data): """正态分布拟合""" mu, sigma = norm.fit(data) return mu, sigma def calculate_rmse(data, dist_func, *params): """计算拟合分布的RMSE""" x = np.sort(data) ecdf = np.arange(1, len(x) + 1) / len(x) cdf = dist_func(x, *params) return np.sqrt(mean_squared_error(ecdf, cdf)) def calculate_r2(data, dist_func, *params): """计算拟合分布的R²值(决定系数)""" x = np.sort(data) ecdf = np.arange(1, len(x) + 1) / len(x) cdf = dist_func(x, *params) # 计算总平方和(SS_tot)和残差平方和(SS_res) y_mean = np.mean(ecdf) ss_tot = np.sum((ecdf - y_mean) ** 2) ss_res = np.sum((ecdf - cdf) ** 2) # 计算R²(避免除零错误) if ss_tot == 0: return 1.0 # 所有数据点相同,R²=1 return 1 - (ss_res / ss_tot) def calculate_gmm_stats(gmm): """计算混合高斯分布的整体均值和标准差""" weights = gmm.weights_ means = gmm.means_.flatten() stds = np.sqrt(gmm.covariances_.flatten()) overall_mean = np.sum(weights * means) overall_variance = np.sum(weights * (stds ** 2 + means ** 2)) - overall_mean ** 2 return overall_mean, np.sqrt(overall_variance) def calculate_aic_bic(data, log_likelihood, n_params): """计算AIC和BIC""" n = len(data) aic = 2 * n_params - 2 * log_likelihood bic = n_params * np.log(n) - 2 * log_likelihood return aic, bic def plot_distribution_fits(data): """绘制多种分布拟合对比图,包含R²检验""" fig, ax1 = plt.subplots(figsize=(14, 10)) # 修改1: 将X轴范围设置为-18到-8 x_min, x_max = -18, -6 x_range = np.linspace(x_min, x_max, 300) # 1. 拟合各种分布并计算评估指标 # GEV分布 c_gev, loc_gev, scale_gev = gev_fit(data) pdf_gev = genextreme.pdf(x_range, c_gev, loc=loc_gev, scale=scale_gev) cdf_gev = genextreme.cdf(x_range, c_gev, loc=loc_gev, scale=scale_gev) rmse_gev = calculate_rmse(data, lambda x: genextreme.cdf(x, c_gev, loc=loc_gev, scale=scale_gev)) r2_gev = calculate_r2(data, lambda x: genextreme.cdf(x, c_gev, loc=loc_gev, scale=scale_gev)) log_likelihood_gev = np.sum(genextreme.logpdf(data, c_gev, loc=loc_gev, scale=scale_gev)) aic_gev, bic_gev = calculate_aic_bic(data, log_likelihood_gev, 3) # 混合高斯分布 gmm = gaussian_mixture_fit(data) components = [(gmm.weights_[i], gmm.means_[i][0], np.sqrt(gmm.covariances_[i][0][0])) for i in range(gmm.n_components)] # 计算混合高斯PDF/CDF pdf_gmm = np.sum([w * norm.pdf(x_range, m, s) for w, m, s in components], axis=0) cdf_gmm = np.sum([w * norm.cdf(x_range, m, s) for w, m, s in components], axis=0) rmse_gmm = calculate_rmse(data, lambda x: np.sum([w * norm.cdf(x, m, s) for w, m, s in components], axis=0)) r2_gmm = calculate_r2(data, lambda x: np.sum([w * norm.cdf(x, m, s) for w, m, s in components], axis=0)) log_likelihood_gmm = gmm.score(data.reshape(-1, 1)) * len(data) aic_gmm, bic_gmm = calculate_aic_bic(data, log_likelihood_gmm, gmm.n_components * 3 - 1) # 皮尔逊Ⅲ型分布 skew_pearson3, loc_pearson3, scale_pearson3 = pearson3_fit(data) pdf_pearson3 = pearson3.pdf(x_range, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3) cdf_pearson3 = pearson3.cdf(x_range, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3) rmse_pearson3 = calculate_rmse(data, lambda x: pearson3.cdf(x, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3)) r2_pearson3 = calculate_r2(data, lambda x: pearson3.cdf(x, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3)) log_likelihood_pearson3 = np.sum(pearson3.logpdf(data, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3)) aic_pearson3, bic_pearson3 = calculate_aic_bic(data, log_likelihood_pearson3, 3) # 正态分布 mu_norm, sigma_norm = normal_fit(data) pdf_norm = norm.pdf(x_range, mu_norm, sigma_norm) cdf_norm = norm.cdf(x_range, mu_norm, sigma_norm) rmse_norm = calculate_rmse(data, lambda x: norm.cdf(x, mu_norm, sigma_norm)) r2_norm = calculate_r2(data, lambda x: norm.cdf(x, mu_norm, sigma_norm)) log_likelihood_norm = np.sum(norm.logpdf(data, mu_norm, sigma_norm)) aic_norm, bic_norm = calculate_aic_bic(data, log_likelihood_norm, 2) # 2. 定义分布字典并确定最优分布(R²优先,R²相同时RMSE优先) distributions = { 'GEV分布': (r2_gev, rmse_gev, (pdf_gev, cdf_gev, 'r-', 'GEV分布')), '混合高斯分布': (r2_gmm, rmse_gmm, (pdf_gmm, cdf_gmm, 'g-', '混合高斯分布')), '皮尔逊Ⅲ型分布': (r2_pearson3, rmse_pearson3, (pdf_pearson3, cdf_pearson3, 'b-', '皮尔逊Ⅲ型分布')), '正态分布': (r2_norm, rmse_norm, (pdf_norm, cdf_norm, 'm-', '正态分布')) } # 按R²降序、RMSE升序排序 best_distribution = max(distributions.items(), key=lambda x: (x[1][0], -x[1][1]))[0] best_r2, best_rmse, (best_pdf, best_cdf, best_linestyle, best_label) = distributions[best_distribution] # 3. 绘制概率密度函数对比图 # 修改2: 调整直方图bins范围 bins = np.arange(x_min, x_max + 0.5, 0.5) ax1.hist(data, bins=bins, density=True, alpha=0.3, color='skyblue', edgecolor='black', label='实际数据直方图', rwidth=0.9) pdf_lines = [] for name, (r2, rmse, (pdf, _, linestyle, label)) in distributions.items(): linewidth = 3 if name == best_distribution else 3 # 修改:图例中只显示分布名称,不显示R² line, = ax1.plot(x_range, pdf, linestyle, lw=linewidth, label=f'{label}') pdf_lines.append(line) ax1.set_ylabel('概率密度', fontsize=28, color='black') ax1.set_xlim(x_min, x_max) # 修改3: 调整X轴刻度 ax1.set_xticks(np.arange(x_min, x_max + 2, 2)) # 每2度一个刻度 for tick in ax1.get_xticklabels(): tick.set_fontproperties('Times New Roman') tick.set_fontsize(28) for tick in ax1.get_yticklabels(): tick.set_fontproperties('Times New Roman') tick.set_fontsize(28) ax1.grid(False) # 4. 绘制累积分布函数对比图 ax2 = ax1.twinx() sorted_data = np.sort(data) ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) ax2.plot(sorted_data, ecdf, 'k.', alpha=0.6, label='经验累积分布') for name, (r2, rmse, (_, cdf, linestyle, _)) in distributions.items(): linewidth = 3 if name == best_distribution else 2 pdf_line = next(line for line in pdf_lines if line.get_label().startswith(name)) ax2.plot(x_range, cdf, color=pdf_line.get_color(), linestyle=pdf_line.get_linestyle(), lw=linewidth) ax2.set_ylabel('累积概率', fontsize=28, color='black') # 修改4: 调整右侧Y轴的X轴刻度 ax2.set_xticks(np.arange(x_min, x_max + 2, 2)) # 每2度一个刻度 ax2.set_ylim(0, 1.05) for tick in ax2.get_xticklabels(): tick.set_fontproperties('Times New Roman') tick.set_fontsize(28) for tick in ax2.get_yticklabels(): tick.set_fontproperties('Times New Roman') tick.set_fontsize(28) ax2.grid(False) # 5. 合并图例(此时图例已不包含R²) lines1, labels1 = ax1.get_legend_handles_labels() lines2, labels2 = ax2.get_legend_handles_labels() # 合并所有图例 ax1.legend(lines1 + [lines2[0]], labels1 + [labels2[0]], prop={'family': 'SimSun', 'size': 26}, loc='upper right', bbox_to_anchor=(0.32, 0.85), ncol=1, framealpha=0.9, frameon=False) # 关键修改:确保X轴标签完整显示 ax1.set_xlabel('温度 (°C)', fontsize=28) # 调整图表布局确保X轴标签完整显示 plt.subplots_adjust(bottom=0.15) # 增加底部边距 plt.tight_layout(rect=[0, 0.05, 1, 0.98]) # 明确指定绘图区域 plt.show() # 6. 输出评估结果(评估指标仍在控制台输出) print("\n### 分布参数 ###") print(f"GEV分布: 形状={c_gev:.4f}, 位置={loc_gev:.4f}, 尺度={scale_gev:.4f}") print(f"混合高斯分布: 成分数={gmm.n_components}") for i, (w, m, s) in enumerate(components): print(f" 成分 {i + 1}: 权重={w:.4f}, 均值={m:.4f}, 标准差={s:.4f}") gmm_mean, gmm_std = calculate_gmm_stats(gmm) print(f" 整体统计量: 均值={gmm_mean:.4f}, 标准差={gmm_std:.4f}") print(f"皮尔逊Ⅲ型分布: 偏度={skew_pearson3:.4f}, 位置={loc_pearson3:.4f}, 尺度={scale_pearson3:.4f}") print(f"正态分布: 均值={mu_norm:.4f}, 标准差={sigma_norm:.4f}") print("\n### 模型评估指标 ###") print(f"{'分布类型':<18}{'R²':<10}{'RMSE':<10}{'AIC':<10}{'BIC':<10}") print(f"GEV分布: {r2_gev:.4f} {rmse_gev:.4f} {aic_gev:.4f} {bic_gev:.4f}") print(f"混合高斯分布: {r2_gmm:.4f} {rmse_gmm:.4f} {aic_gmm:.4f} {bic_gmm:.4f}") print(f"皮尔逊Ⅲ型分布: {r2_pearson3:.4f} {rmse_pearson3:.4f} {aic_pearson3:.4f} {bic_pearson3:.4f}") print(f"正态分布: {r2_norm:.4f} {rmse_norm:.4f} {aic_norm:.4f} {bic_norm:.4f}") print("\n### 模型选择结论 ###") print(f"R²最优模型: {best_distribution} (={best_r2:.4f})") # AIC和BIC最优模型的输出逻辑 aic_results = {'GEV分布': aic_gev, '混合高斯分布': aic_gmm, '皮尔逊Ⅲ型分布': aic_pearson3, '正态分布': aic_norm} bic_results = {'GEV分布': bic_gev, '混合高斯分布': bic_gmm, '皮尔逊Ⅲ型分布': bic_pearson3, '正态分布': bic_norm} best_aic_model = min(aic_results, key=aic_results.get) best_bic_model = min(bic_results, key=bic_results.get) print(f"AIC最优模型: {best_aic_model} (AIC={aic_results[best_aic_model]:.4f})") print(f"BIC最优模型: {best_bic_model} (BIC={bic_results[best_bic_model]:.4f})") if __name__ == "__main__": file_path = r"C:\Users\汤伟娟\Desktop\temperature1.xlsx" # 请根据实际文件路径修改 sheet_name = 'Sheet1' column_name = 'Temperature (°C)' temperatures = load_temperature_data(file_path, sheet_name, column_name) plot_distribution_fits(temperatures) 修改代码,将x轴标签,温度用宋体,℃用新罗马
最新发布
07-25
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值