import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import genextreme, pearson3, norm
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error
import scipy.stats as stats
# 设置中文字体支持
plt.rcParams["font.family"] = ["SimSun", "SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams["axes.unicode_minus"] = False
def load_temperature_data(file_path, sheet_name, column_name):
"""加载温度数据并过滤缺失值"""
data = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
return data[column_name].dropna().values
def gev_fit(data):
"""广义极值分布拟合"""
c, loc, scale = genextreme.fit(data)
return c, loc, scale
def gaussian_mixture_fit(data, n_components=2):
"""混合高斯分布拟合"""
data = data.reshape(-1, 1)
gmm = GaussianMixture(n_components=n_components, random_state=42)
gmm.fit(data)
return gmm
def pearson3_fit(data):
"""皮尔逊Ⅲ型分布拟合"""
skew, loc, scale = pearson3.fit(data)
return skew, loc, scale
def normal_fit(data):
"""正态分布拟合"""
mu, sigma = norm.fit(data)
return mu, sigma
def calculate_rmse(data, dist_func, *params):
"""计算拟合分布的RMSE"""
x = np.sort(data)
ecdf = np.arange(1, len(x) + 1) / len(x)
cdf = dist_func(x, *params)
return np.sqrt(mean_squared_error(ecdf, cdf))
def calculate_r2(data, dist_func, *params):
"""计算拟合分布的R²值(决定系数)"""
x = np.sort(data)
ecdf = np.arange(1, len(x) + 1) / len(x)
cdf = dist_func(x, *params)
# 计算总平方和(SS_tot)和残差平方和(SS_res)
y_mean = np.mean(ecdf)
ss_tot = np.sum((ecdf - y_mean) ** 2)
ss_res = np.sum((ecdf - cdf) ** 2)
# 计算R²(避免除零错误)
if ss_tot == 0:
return 1.0 # 所有数据点相同,R²=1
return 1 - (ss_res / ss_tot)
def calculate_gmm_stats(gmm):
"""计算混合高斯分布的整体均值和标准差"""
weights = gmm.weights_
means = gmm.means_.flatten()
stds = np.sqrt(gmm.covariances_.flatten())
overall_mean = np.sum(weights * means)
overall_variance = np.sum(weights * (stds ** 2 + means ** 2)) - overall_mean ** 2
return overall_mean, np.sqrt(overall_variance)
def calculate_aic_bic(data, log_likelihood, n_params):
"""计算AIC和BIC"""
n = len(data)
aic = 2 * n_params - 2 * log_likelihood
bic = n_params * np.log(n) - 2 * log_likelihood
return aic, bic
def plot_distribution_fits(data):
"""绘制多种分布拟合对比图,包含R²检验"""
fig, ax1 = plt.subplots(figsize=(14, 10))
# 修改1: 将X轴范围设置为-18到-8
x_min, x_max = -18, -6
x_range = np.linspace(x_min, x_max, 300)
# 1. 拟合各种分布并计算评估指标
# GEV分布
c_gev, loc_gev, scale_gev = gev_fit(data)
pdf_gev = genextreme.pdf(x_range, c_gev, loc=loc_gev, scale=scale_gev)
cdf_gev = genextreme.cdf(x_range, c_gev, loc=loc_gev, scale=scale_gev)
rmse_gev = calculate_rmse(data, lambda x: genextreme.cdf(x, c_gev, loc=loc_gev, scale=scale_gev))
r2_gev = calculate_r2(data, lambda x: genextreme.cdf(x, c_gev, loc=loc_gev, scale=scale_gev))
log_likelihood_gev = np.sum(genextreme.logpdf(data, c_gev, loc=loc_gev, scale=scale_gev))
aic_gev, bic_gev = calculate_aic_bic(data, log_likelihood_gev, 3)
# 混合高斯分布
gmm = gaussian_mixture_fit(data)
components = [(gmm.weights_[i], gmm.means_[i][0], np.sqrt(gmm.covariances_[i][0][0]))
for i in range(gmm.n_components)]
# 计算混合高斯PDF/CDF
pdf_gmm = np.sum([w * norm.pdf(x_range, m, s) for w, m, s in components], axis=0)
cdf_gmm = np.sum([w * norm.cdf(x_range, m, s) for w, m, s in components], axis=0)
rmse_gmm = calculate_rmse(data, lambda x: np.sum([w * norm.cdf(x, m, s) for w, m, s in components], axis=0))
r2_gmm = calculate_r2(data, lambda x: np.sum([w * norm.cdf(x, m, s) for w, m, s in components], axis=0))
log_likelihood_gmm = gmm.score(data.reshape(-1, 1)) * len(data)
aic_gmm, bic_gmm = calculate_aic_bic(data, log_likelihood_gmm, gmm.n_components * 3 - 1)
# 皮尔逊Ⅲ型分布
skew_pearson3, loc_pearson3, scale_pearson3 = pearson3_fit(data)
pdf_pearson3 = pearson3.pdf(x_range, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3)
cdf_pearson3 = pearson3.cdf(x_range, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3)
rmse_pearson3 = calculate_rmse(data,
lambda x: pearson3.cdf(x, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3))
r2_pearson3 = calculate_r2(data, lambda x: pearson3.cdf(x, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3))
log_likelihood_pearson3 = np.sum(pearson3.logpdf(data, skew_pearson3, loc=loc_pearson3, scale=scale_pearson3))
aic_pearson3, bic_pearson3 = calculate_aic_bic(data, log_likelihood_pearson3, 3)
# 正态分布
mu_norm, sigma_norm = normal_fit(data)
pdf_norm = norm.pdf(x_range, mu_norm, sigma_norm)
cdf_norm = norm.cdf(x_range, mu_norm, sigma_norm)
rmse_norm = calculate_rmse(data, lambda x: norm.cdf(x, mu_norm, sigma_norm))
r2_norm = calculate_r2(data, lambda x: norm.cdf(x, mu_norm, sigma_norm))
log_likelihood_norm = np.sum(norm.logpdf(data, mu_norm, sigma_norm))
aic_norm, bic_norm = calculate_aic_bic(data, log_likelihood_norm, 2)
# 2. 定义分布字典并确定最优分布(R²优先,R²相同时RMSE优先)
distributions = {
'GEV分布': (r2_gev, rmse_gev, (pdf_gev, cdf_gev, 'r-', 'GEV分布')),
'混合高斯分布': (r2_gmm, rmse_gmm, (pdf_gmm, cdf_gmm, 'g-', '混合高斯分布')),
'皮尔逊Ⅲ型分布': (r2_pearson3, rmse_pearson3, (pdf_pearson3, cdf_pearson3, 'b-', '皮尔逊Ⅲ型分布')),
'正态分布': (r2_norm, rmse_norm, (pdf_norm, cdf_norm, 'm-', '正态分布'))
}
# 按R²降序、RMSE升序排序
best_distribution = max(distributions.items(),
key=lambda x: (x[1][0], -x[1][1]))[0]
best_r2, best_rmse, (best_pdf, best_cdf, best_linestyle, best_label) = distributions[best_distribution]
# 3. 绘制概率密度函数对比图
# 修改2: 调整直方图bins范围
bins = np.arange(x_min, x_max + 0.5, 0.5)
ax1.hist(data, bins=bins, density=True, alpha=0.3,
color='skyblue', edgecolor='black', label='实际数据直方图', rwidth=0.9)
pdf_lines = []
for name, (r2, rmse, (pdf, _, linestyle, label)) in distributions.items():
linewidth = 3 if name == best_distribution else 3
# 修改:图例中只显示分布名称,不显示R²
line, = ax1.plot(x_range, pdf, linestyle, lw=linewidth,
label=f'{label}')
pdf_lines.append(line)
ax1.set_ylabel('概率密度', fontsize=28, color='black')
ax1.set_xlim(x_min, x_max)
# 修改3: 调整X轴刻度
ax1.set_xticks(np.arange(x_min, x_max + 2, 2)) # 每2度一个刻度
for tick in ax1.get_xticklabels():
tick.set_fontproperties('Times New Roman')
tick.set_fontsize(28)
for tick in ax1.get_yticklabels():
tick.set_fontproperties('Times New Roman')
tick.set_fontsize(28)
ax1.grid(False)
# 4. 绘制累积分布函数对比图
ax2 = ax1.twinx()
sorted_data = np.sort(data)
ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
ax2.plot(sorted_data, ecdf, 'k.', alpha=0.6, label='经验累积分布')
for name, (r2, rmse, (_, cdf, linestyle, _)) in distributions.items():
linewidth = 3 if name == best_distribution else 2
pdf_line = next(line for line in pdf_lines if line.get_label().startswith(name))
ax2.plot(x_range, cdf, color=pdf_line.get_color(), linestyle=pdf_line.get_linestyle(), lw=linewidth)
ax2.set_ylabel('累积概率', fontsize=28, color='black')
# 修改4: 调整右侧Y轴的X轴刻度
ax2.set_xticks(np.arange(x_min, x_max + 2, 2)) # 每2度一个刻度
ax2.set_ylim(0, 1.05)
for tick in ax2.get_xticklabels():
tick.set_fontproperties('Times New Roman')
tick.set_fontsize(28)
for tick in ax2.get_yticklabels():
tick.set_fontproperties('Times New Roman')
tick.set_fontsize(28)
ax2.grid(False)
# 5. 合并图例(此时图例已不包含R²)
lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
# 合并所有图例
ax1.legend(lines1 + [lines2[0]], labels1 + [labels2[0]],
prop={'family': 'SimSun', 'size': 26},
loc='upper right',
bbox_to_anchor=(0.32, 0.85),
ncol=1,
framealpha=0.9,
frameon=False)
# 关键修改:确保X轴标签完整显示
ax1.set_xlabel('温度 (°C)', fontsize=28)
# 调整图表布局确保X轴标签完整显示
plt.subplots_adjust(bottom=0.15) # 增加底部边距
plt.tight_layout(rect=[0, 0.05, 1, 0.98]) # 明确指定绘图区域
plt.show()
# 6. 输出评估结果(评估指标仍在控制台输出)
print("\n### 分布参数 ###")
print(f"GEV分布: 形状={c_gev:.4f}, 位置={loc_gev:.4f}, 尺度={scale_gev:.4f}")
print(f"混合高斯分布: 成分数={gmm.n_components}")
for i, (w, m, s) in enumerate(components):
print(f" 成分 {i + 1}: 权重={w:.4f}, 均值={m:.4f}, 标准差={s:.4f}")
gmm_mean, gmm_std = calculate_gmm_stats(gmm)
print(f" 整体统计量: 均值={gmm_mean:.4f}, 标准差={gmm_std:.4f}")
print(f"皮尔逊Ⅲ型分布: 偏度={skew_pearson3:.4f}, 位置={loc_pearson3:.4f}, 尺度={scale_pearson3:.4f}")
print(f"正态分布: 均值={mu_norm:.4f}, 标准差={sigma_norm:.4f}")
print("\n### 模型评估指标 ###")
print(f"{'分布类型':<18}{'R²':<10}{'RMSE':<10}{'AIC':<10}{'BIC':<10}")
print(f"GEV分布: {r2_gev:.4f} {rmse_gev:.4f} {aic_gev:.4f} {bic_gev:.4f}")
print(f"混合高斯分布: {r2_gmm:.4f} {rmse_gmm:.4f} {aic_gmm:.4f} {bic_gmm:.4f}")
print(f"皮尔逊Ⅲ型分布: {r2_pearson3:.4f} {rmse_pearson3:.4f} {aic_pearson3:.4f} {bic_pearson3:.4f}")
print(f"正态分布: {r2_norm:.4f} {rmse_norm:.4f} {aic_norm:.4f} {bic_norm:.4f}")
print("\n### 模型选择结论 ###")
print(f"R²最优模型: {best_distribution} (R²={best_r2:.4f})")
# AIC和BIC最优模型的输出逻辑
aic_results = {'GEV分布': aic_gev, '混合高斯分布': aic_gmm, '皮尔逊Ⅲ型分布': aic_pearson3, '正态分布': aic_norm}
bic_results = {'GEV分布': bic_gev, '混合高斯分布': bic_gmm, '皮尔逊Ⅲ型分布': bic_pearson3, '正态分布': bic_norm}
best_aic_model = min(aic_results, key=aic_results.get)
best_bic_model = min(bic_results, key=bic_results.get)
print(f"AIC最优模型: {best_aic_model} (AIC={aic_results[best_aic_model]:.4f})")
print(f"BIC最优模型: {best_bic_model} (BIC={bic_results[best_bic_model]:.4f})")
if __name__ == "__main__":
file_path = r"C:\Users\汤伟娟\Desktop\temperature1.xlsx" # 请根据实际文件路径修改
sheet_name = 'Sheet1'
column_name = 'Temperature (°C)'
temperatures = load_temperature_data(file_path, sheet_name, column_name)
plot_distribution_fits(temperatures) 修改代码,将x轴标签,温度用宋体,℃用新罗马
最新发布