import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
# ==================== Matplotlib CJK font configuration ====================
# Register a cross-platform fallback list so Chinese labels render as text
# instead of empty boxes; matplotlib walks the list until it finds an
# installed family.
try:
    plt.rcParams['font.sans-serif'] = [
        'SimHei',             # Windows
        'Microsoft YaHei',    # Windows
        'Arial Unicode MS',   # macOS
        'PingFang SC',        # macOS
        'WenQuanYi Zen Hei',  # Linux
        'Noto Sans CJK SC',   # Linux
        'sans-serif',         # last-resort default
    ]
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
except Exception:  # narrow from bare `except:` so Ctrl-C is not swallowed
    print("注意:中文字体设置可能失败,图表中的中文可能显示为方框")
# ==================== 1. 辅助函数定义 ====================
def halton_sequence(n, base):
    """Return the first ``n`` points of the Halton sequence for ``base``.

    Points are indexed from 1 (index 0 of the classic sequence is 0 and is
    skipped), so every value lies strictly in (0, 1).
    """
    points = np.empty(n)
    for k in range(1, n + 1):
        inv_base = 1.0
        value = 0.0
        remainder = k
        # Radical-inverse of k in the given base.
        while remainder > 0:
            inv_base /= base
            value += inv_base * (remainder % base)
            remainder //= base
        points[k - 1] = value
    return points
def generate_halton_draws(n_obs, n_draws, n_params):
    """Quasi-random standard-normal draws, shape (n_obs, n_draws, n_params).

    One Halton base (a small prime) per parameter; uniform points are mapped
    to normals via the inverse CDF.  The same draw matrix is shared by every
    observation (broadcast along axis 0).
    """
    primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37]
    burn_in = 100  # discard early points, which are highly correlated across bases
    draws = np.zeros((n_obs, n_draws, n_params))
    for param_idx in range(n_params):
        base = primes[param_idx % len(primes)]
        uniform_points = halton_sequence(n_draws + burn_in, base)[burn_in:]
        draws[:, :, param_idx] = norm.ppf(uniform_points)[np.newaxis, :]
    return draws
# ==================== 2. RandomParametersLogit类定义 ====================
class RandomParametersLogit:
    """Mixed (random-parameters) multinomial logit estimated by maximum
    simulated likelihood.

    Each coefficient is assumed normally distributed across observations.
    The simulated choice probability averages standard logit probabilities
    over ``n_draws`` coefficient draws (Halton quasi-random or pseudo-random
    normals).
    """

    def __init__(self, n_draws=200, method='halton'):
        # n_draws: number of simulation draws per observation.
        # method: 'halton' for quasi-random draws; any other value falls
        # back to pseudo-random normal draws.
        self.n_draws = n_draws
        self.method = method
        self.beta_means = None   # estimated means of the random coefficients
        self.beta_std = None     # estimated standard deviations
        self.fitted = False
        self.feature_names = None
        self.choice_names = None

    def _generate_random_parameters(self, n_obs, beta_mean, beta_std):
        """Draw coefficient vectors; returns shape (n_obs, n_draws, n_params)."""
        n_params = len(beta_mean)
        if self.method == 'halton':
            draws = generate_halton_draws(n_obs, self.n_draws, n_params)
        else:
            draws = np.random.randn(n_obs, self.n_draws, n_params)
        random_params = beta_mean.reshape(1, 1, -1) + beta_std.reshape(1, 1, -1) * draws
        return random_params

    def _simulate_choice_probabilities(self, X, beta_mean, beta_std):
        """Simulated choice probabilities, shape (n_obs, n_choices).

        Shared core of _compute_probabilities and predict_probabilities
        (previously duplicated in both): draw random coefficients, compute
        per-draw logit probabilities with max-subtraction for numerical
        stability, then average over draws.
        """
        n_obs, n_choices, n_features = X.shape
        beta_random = self._generate_random_parameters(n_obs, beta_mean, beta_std)
        utilities = np.zeros((n_obs, self.n_draws, n_choices))
        for r in range(self.n_draws):
            beta_r = beta_random[:, r, :]
            for i in range(n_choices):
                utilities[:, r, i] = np.sum(beta_r * X[:, i, :], axis=1)
        max_utility = np.max(utilities, axis=2, keepdims=True)
        exp_utility = np.exp(utilities - max_utility)
        sum_exp = np.sum(exp_utility, axis=2, keepdims=True)
        prob_draws = exp_utility / sum_exp
        return np.mean(prob_draws, axis=1)

    def _compute_probabilities(self, X, y, beta_mean, beta_std):
        """Return (negative simulated log-likelihood, simulated probabilities)."""
        simulated_probs = self._simulate_choice_probabilities(X, beta_mean, beta_std)
        n_obs = X.shape[0]
        # Probability each observation assigns to its actually-chosen
        # alternative; clipped to guard log(0).
        chosen_probs = simulated_probs[np.arange(n_obs), y]
        chosen_probs = np.clip(chosen_probs, 1e-15, 1)
        neg_ll = -np.sum(np.log(chosen_probs))
        return neg_ll, simulated_probs

    def fit(self, X, y, feature_names=None, choice_names=None, init_params=None, verbose=True):
        """Estimate coefficient means and standard deviations by L-BFGS-B.

        Parameters
        ----------
        X : ndarray, shape (n_obs, n_choices, n_features)
            Alternative-specific feature array.
        y : ndarray of int
            0-based index of the chosen alternative per observation.
        feature_names, choice_names : optional labels used in reports.
        init_params : ndarray or None
            Start vector [means, log std devs] of length 2*n_features;
            random when omitted (making repeated fits non-deterministic).
        verbose : bool
            Print progress and optimizer output.

        Returns
        -------
        scipy.optimize.OptimizeResult
        """
        n_obs, n_choices, n_features = X.shape
        if feature_names is None:
            self.feature_names = [f'特征_{i+1}' for i in range(n_features)]
        else:
            self.feature_names = feature_names
        if choice_names is not None:
            self.choice_names = choice_names
        if verbose:
            print(f"开始拟合随机参数Logit模型...")
            print(f"观测数: {n_obs}, 选择数: {n_choices}, 特征数: {n_features}")
            print(f"蒙特卡罗抽样次数: {self.n_draws}")
        # Start values: small random means; std devs start at 0.5 and are
        # optimized on the log scale so they stay positive.
        if init_params is None:
            init_params = np.concatenate([
                np.random.randn(n_features) * 0.1,
                np.log(np.ones(n_features) * 0.5)
            ])

        def objective(params):
            # params = [means, log std devs]
            beta_mean = params[:n_features]
            beta_std = np.exp(params[n_features:])
            neg_ll, _ = self._compute_probabilities(X, y, beta_mean, beta_std)
            return neg_ll

        # Bound the log-stds to avoid overflow in exp().
        bounds = [(None, None)] * n_features + [(-10, 10)] * n_features
        result = minimize(
            objective,
            init_params,
            method='L-BFGS-B',
            bounds=bounds,
            options={'maxiter': 1000, 'ftol': 1e-6, 'disp': verbose}
        )
        # Store estimates (std devs transformed back from the log scale).
        self.beta_means = result.x[:n_features]
        self.beta_std = np.exp(result.x[n_features:])
        self.fitted = True
        self.result = result
        # Simulated probabilities at the optimum, reused by save_results().
        _, self.final_probs = self._compute_probabilities(X, y, self.beta_means, self.beta_std)
        if verbose:
            print("\n优化完成!")
            print(f"优化状态: {result.success}")
            print(f"最终负对数似然值: {result.fun:.4f}")
        return result

    def predict_probabilities(self, X):
        """Simulated choice probabilities for X, shape (n_obs, n_choices)."""
        if not self.fitted:
            raise ValueError("模型未拟合,请先调用fit方法")
        return self._simulate_choice_probabilities(X, self.beta_means, self.beta_std)

    def get_parameters(self):
        """Return the fitted means, std devs and feature labels as a dict."""
        if not self.fitted:
            raise ValueError("模型未拟合")
        return {
            'beta_means': self.beta_means,
            'beta_std': self.beta_std,
            'feature_names': self.feature_names
        }

    def summary(self):
        """Print a formatted summary of the optimization and the estimates."""
        if not self.fitted:
            print("模型未拟合")
            return
        print("=" * 70)
        print("随机参数Logit模型结果摘要")
        print("=" * 70)
        print(f"\n优化信息:")
        print(f" 状态: {self.result.success}")
        print(f" 消息: {self.result.message}")
        print(f" 迭代次数: {self.result.nit}")
        print(f" 负对数似然: {self.result.fun:.4f}")
        print(f" 对数似然: {-self.result.fun:.4f}")
        print(f"\n参数估计 (随机参数):")
        print("-" * 60)
        print(f"{'特征':<25} {'均值':<15} {'标准差':<15} {'显著性':<15}")
        print("-" * 60)
        for i in range(len(self.beta_means)):
            # Approximate t statistic.
            # NOTE(review): beta_std/sqrt(n_draws) is not a true standard
            # error of the mean coefficient (n_draws is the simulation
            # count, not the sample size) -- treat the stars below as
            # heuristic indicators only.
            t_stat = abs(self.beta_means[i] / (self.beta_std[i] / np.sqrt(self.n_draws)))
            # Star codes from two-sided normal critical values.
            if t_stat > 2.58:
                sig = '***'
            elif t_stat > 1.96:
                sig = '**'
            elif t_stat > 1.645:
                sig = '*'
            else:
                sig = ''
            print(f"{self.feature_names[i]:<25} {self.beta_means[i]:<15.4f} {self.beta_std[i]:<15.4f} {sig:<15}")
        print("\n显著性标记: * p<0.1, ** p<0.05, *** p<0.01")
        print("=" * 70)
# ==================== 3. 数据加载和预处理 ====================
def load_actual_data(file_path='编码数据.xlsx'):
    """Load the accident workbook.

    Sheet 0 holds the coded accident records, sheet 1 the code book
    (variable name in column 0, code in column 1, label in column 2).
    Falls back to the bundled sample data when the file cannot be read.

    Returns
    -------
    (DataFrame, dict)
        The raw data and ``coding_info`` mapping
        variable name -> {code: description}.
    """
    print(f"加载Excel数据: {file_path}")
    try:
        # Sheet 0: raw coded data
        df_data = pd.read_excel(file_path, sheet_name=0)
        print(f"Sheet1 - 原始数据形状: {df_data.shape}")
        print(f"Sheet1 - 数据列: {df_data.columns.tolist()}")
        # Sheet 1: code book
        df_codes = pd.read_excel(file_path, sheet_name=1)
        print(f"\nSheet2 - 编码信息形状: {df_codes.shape}")
        # Parse the code book.  A non-empty first column starts a new
        # variable; every row with a code in column 1 then adds an entry to
        # the current variable.  This single pass replaces the original
        # if/else whose two branches contained the same code-extraction
        # logic duplicated verbatim.
        coding_info = {}
        current_variable = None
        for _, row in df_codes.iterrows():
            if pd.notna(row.iloc[0]) and row.iloc[0] != '':
                current_variable = str(row.iloc[0]).strip()
                coding_info[current_variable] = {}
            if current_variable and pd.notna(row.iloc[1]) and row.iloc[1] != '':
                code = int(row.iloc[1])
                desc = str(row.iloc[2]) if pd.notna(row.iloc[2]) else "未知"
                coding_info[current_variable][code] = desc
        print(f"\n解析出的变量编码数量: {len(coding_info)}")
        return df_data, coding_info
    except Exception as e:
        # Best-effort fallback keeps the pipeline runnable without the file.
        print(f"加载数据时出错: {e}")
        print("使用示例数据替代...")
        return load_sample_data(), get_sample_coding_info()
def load_sample_data():
    """Build a reproducible synthetic dataset of 100 coded AV accident
    records matching the layout of the real Excel sheet."""
    print("使用示例数据")
    np.random.seed(42)  # fixed seed: identical frame on every call
    n_samples = 100
    # (column name, category codes, sampling probabilities).
    # NOTE: the 'Typr of collision' typo is kept on purpose -- downstream
    # code and the real workbook use that exact column name.
    column_specs = [
        ('Highest injured severity', [0, 1, 2, 3], [0.6, 0.25, 0.1, 0.05]),
        ('Typr of collision', [0, 1, 2, 3], [0.4, 0.3, 0.2, 0.1]),
        ('Season', [0, 1, 2, 3], [0.25, 0.25, 0.25, 0.25]),
        ('Weekdays Mon-Fri', [0, 1], [0.3, 0.7]),
        ('Rush Hour or not', [0, 1], [0.6, 0.4]),
        ('Weather', [0, 1, 2, 3], [0.5, 0.3, 0.15, 0.05]),
        ('Light condition', [0, 1, 2, 3], [0.6, 0.1, 0.2, 0.1]),
        ('Roadway Surface', [0, 1, 2, 3], [0.7, 0.2, 0.05, 0.05]),
        ('Roadway Conditions', [0, 1, 2, 3, 4], [0.8, 0.05, 0.05, 0.05, 0.05]),
    ]
    records = {'Report ID': [f'AV_Accident_{i:03d}' for i in range(1, n_samples + 1)]}
    # Draw each column in spec order so the RNG stream matches the
    # original literal-dict construction exactly.
    for column, codes, probs in column_specs:
        records[column] = np.random.choice(codes, n_samples, p=probs)
    return pd.DataFrame(records)
def get_sample_coding_info():
    """Code book for the sample data: variable -> {code: description}."""
    def _codes(*descriptions):
        # Map consecutive codes 0..k-1 onto the given descriptions.
        return dict(enumerate(descriptions))

    return {
        'Highest injured severity': _codes('无伤害或轻微伤害', '可能的伤害',
                                           '明显的伤害', '严重伤害/致命伤害'),
        'Typr of collision': _codes('追尾', '正面碰撞', '侧面碰撞', '其他'),
        'Season': _codes('冬季', '春季', '夏季', '秋季'),
        'Weekdays Mon-Fri': _codes('周末', '工作日'),
        'Rush Hour or not': _codes('非高峰时段', '高峰时段'),
        'Weather': _codes('晴朗', '多云', '雨/雪', '其他'),
        'Light condition': _codes('白天', '黎明/黄昏', '夜晚(有照明)', '夜晚(无照明)'),
        'Roadway Surface': _codes('干燥', '潮湿', '积雪', '结冰'),
        'Roadway Conditions': _codes('正常', '施工', '障碍物', '其他', '未知'),
    }
def prepare_data_for_model(df, target_col='Highest injured severity', feature_cols=None):
    """
    Prepare inputs for the random-parameters logit model.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per accident record.
    target_col : str
        Dependent (choice) column name.
    feature_cols : list[str] or None
        Feature columns; when None, every numeric column except
        'Report ID' and the target is used.

    Returns
    -------
    (X, y, choice_categories, feature_cols, scaler, y_mapping)
        X : float ndarray (n_obs, n_choices, n_features) -- the
            standardized feature row of each observation repeated for every
            alternative (regressors do not vary across alternatives here).
        y : int ndarray of 0-based choice indices.
        choice_categories : sorted original category codes.
        scaler : fitted StandardScaler (reusable on new data).
        y_mapping : dict, original code -> 0-based index.
    """
    print(f"\n准备数据用于模型...")
    print(f"目标变量: {target_col}")
    # Default feature set: all numeric columns except the ID and the target.
    if feature_cols is None:
        feature_cols = [col for col in df.columns
                        if col not in ['Report ID', target_col]
                        and pd.api.types.is_numeric_dtype(df[col])]
    print(f"特征变量 ({len(feature_cols)}个): {feature_cols}")
    # Basic data-quality report.
    print(f"\n数据质量检查:")
    print(f"总观测数: {len(df)}")
    print(f"目标变量分布:")
    print(df[target_col].value_counts().sort_index())
    # Drop rows with a missing target or missing features.
    df_clean = df.dropna(subset=[target_col] + feature_cols)
    if len(df_clean) < len(df):
        print(f"移除 {len(df) - len(df_clean)} 个包含缺失值的观测")
    X_raw = df_clean[feature_cols].values
    y_raw = df_clean[target_col].values
    # Standardize features (helps optimizer convergence).
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_raw)
    choice_categories = sorted(np.unique(y_raw))
    n_choices = len(choice_categories)
    n_obs = X_scaled.shape[0]
    n_features = X_scaled.shape[1]
    print(f"\n处理后的数据:")
    print(f"有效观测数: {n_obs}")
    print(f"选择类别数: {n_choices}")
    print(f"选择类别: {choice_categories}")
    print(f"特征数: {n_features}")
    # Replicate each observation's feature row across all alternatives in
    # one vectorized step -- replaces the original O(n_obs * n_choices)
    # Python double loop; produces an identical array.
    X = np.repeat(X_scaled[:, np.newaxis, :], n_choices, axis=1)
    # Re-code the target to 0..n_choices-1.
    y_mapping = {category: idx for idx, category in enumerate(choice_categories)}
    y = np.array([y_mapping[val] for val in y_raw])
    print(f"最终X形状: {X.shape}")
    print(f"最终y形状: {y.shape}")
    return X, y, choice_categories, feature_cols, scaler, y_mapping
def interpret_coding_scheme(coding_info):
    """Print the variable coding scheme and return it unchanged."""
    print("\n" + "=" * 70)
    print("数据编码方案解释")
    print("=" * 70)
    for variable, code_map in coding_info.items():
        print(f"\n{variable}:")
        for code_value, meaning in code_map.items():
            print(f" {code_value}: {meaning}")
    return coding_info
# ==================== 4. 运行随机参数模型 ====================
def run_random_parameters_model(X, y, feature_names, choice_categories, n_draws=100):
    """Fit a RandomParametersLogit on (X, y), print a full report and
    return (model, metrics).

    metrics holds log-likelihoods, McFadden pseudo R-squared (plain and
    adjusted), AIC/BIC, in-sample accuracy and the parameter count.
    Note: 'n_obs' is NOT added here; main() injects it before saving.
    """
    print("\n" + "="*70)
    print("运行随机参数Logit模型")
    print("="*70)
    # Build the model with Halton quasi-random draws.
    model = RandomParametersLogit(n_draws=n_draws, method='halton')
    print(f"\n模型配置:")
    print(f" 观测数: {X.shape[0]}")
    print(f" 选择数: {X.shape[1]}")
    print(f" 特征数: {X.shape[2]}")
    print(f" 蒙特卡罗抽样次数: {n_draws}")
    # Maximum simulated likelihood estimation.
    print("\n开始模型拟合...")
    result = model.fit(
        X, y,
        feature_names=feature_names,
        choice_names=[str(c) for c in choice_categories],
        verbose=True
    )
    # Estimation summary table.
    print("\n" + "="*70)
    model.summary()
    # Fit statistics.
    n_obs = X.shape[0]
    # One mean plus one std dev per feature.
    n_params = len(model.beta_means) + len(model.beta_std)
    # Null model: equal probability for every alternative.
    null_ll = n_obs * np.log(1 / X.shape[1])
    model_ll = -result.fun
    # McFadden pseudo R-squared and its parameter-penalized version.
    rho2 = 1 - (model_ll / null_ll)
    adj_rho2 = 1 - ((model_ll - n_params) / null_ll)
    # Information criteria.
    aic = 2 * n_params - 2 * model_ll
    bic = n_params * np.log(n_obs) - 2 * model_ll
    # In-sample accuracy: highest simulated probability wins.
    pred_probs = model.predict_probabilities(X)
    pred_choices = np.argmax(pred_probs, axis=1)
    accuracy = np.mean(pred_choices == y)
    print("\n模型评估指标:")
    print("-"*40)
    print(f"零模型对数似然: {null_ll:.4f}")
    print(f"模型对数似然: {model_ll:.4f}")
    print(f"伪R² (McFadden's R²): {rho2:.4f}")
    print(f"调整伪R²: {adj_rho2:.4f}")
    print(f"AIC: {aic:.4f}")
    print(f"BIC: {bic:.4f}")
    print(f"预测准确率: {accuracy:.2%}")
    print(f"参数数量: {n_params}")
    metrics = {
        'null_ll': null_ll,
        'model_ll': model_ll,
        'rho2': rho2,
        'adj_rho2': adj_rho2,
        'aic': aic,
        'bic': bic,
        'accuracy': accuracy,
        'n_params': n_params
    }
    return model, metrics
# ==================== 5. 结果可视化和解释 ====================
def create_visualizations(model, metrics, feature_names, choice_categories, coding_info):
    """Render and save two figure files summarizing the fitted model:
    a 2x3 overview grid and a mean-vs-std interpretation chart.

    Side effects: writes 'av_accident_random_params_results.png' and
    'av_accident_parameter_interpretation.png' to the working directory and
    calls plt.show() for each figure.
    """
    print("\n" + "="*70)
    print("生成可视化图表")
    print("="*70)
    # Chart style.
    plt.style.use('seaborn-v0_8-whitegrid')
    # Overview figure: 2x3 grid of panels.
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle('AV事故损伤程度随机参数Logit模型分析', fontsize=16, fontweight='bold')
    # Panel 1: estimated coefficient means.
    x_pos = np.arange(len(feature_names))
    colors = plt.cm.Set3(np.linspace(0, 1, len(feature_names)))
    bars1 = axes[0, 0].bar(x_pos, model.beta_means, color=colors, edgecolor='black', alpha=0.8)
    axes[0, 0].axhline(y=0, color='r', linestyle='--', alpha=0.5, linewidth=1)
    axes[0, 0].set_xlabel('特征变量', fontsize=12)
    axes[0, 0].set_ylabel('参数均值', fontsize=12)
    axes[0, 0].set_title('随机参数均值估计', fontsize=14, fontweight='bold')
    axes[0, 0].set_xticks(x_pos)
    axes[0, 0].set_xticklabels(feature_names, rotation=45, ha='right', fontsize=10)
    # Value labels above (or below, for negatives) each bar.
    for bar, v in zip(bars1, model.beta_means):
        height = bar.get_height()
        axes[0, 0].text(bar.get_x() + bar.get_width()/2.,
                        height + (0.01 if height >= 0 else -0.03),
                        f'{v:.3f}', ha='center', va='bottom' if height >= 0 else 'top',
                        fontsize=9)
    # Panel 2: estimated coefficient standard deviations (heterogeneity).
    bars2 = axes[0, 1].bar(x_pos, model.beta_std, color=colors, edgecolor='black', alpha=0.8)
    axes[0, 1].set_xlabel('特征变量', fontsize=12)
    axes[0, 1].set_ylabel('参数标准差', fontsize=12)
    axes[0, 1].set_title('随机参数异质性(标准差)', fontsize=14, fontweight='bold')
    axes[0, 1].set_xticks(x_pos)
    axes[0, 1].set_xticklabels(feature_names, rotation=45, ha='right', fontsize=10)
    # Value labels.
    for bar, v in zip(bars2, model.beta_std):
        height = bar.get_height()
        axes[0, 1].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{v:.3f}', ha='center', va='bottom', fontsize=9)
    # Panel 3: feature importance, defined as |mean| * std.
    importance = np.abs(model.beta_means) * model.beta_std
    sorted_idx = np.argsort(importance)[::-1]
    sorted_features = [feature_names[i] for i in sorted_idx]
    sorted_importance = importance[sorted_idx]
    colors_sorted = [colors[i] for i in sorted_idx]
    bars3 = axes[0, 2].barh(range(len(sorted_features)), sorted_importance,
                            color=colors_sorted, edgecolor='black', alpha=0.8)
    axes[0, 2].set_yticks(range(len(sorted_features)))
    axes[0, 2].set_yticklabels(sorted_features, fontsize=10)
    axes[0, 2].set_xlabel('特征重要性(|均值| × 标准差)', fontsize=12)
    axes[0, 2].set_title('特征重要性排名', fontsize=14, fontweight='bold')
    # Panel 4: mean +/- std error bars (parameter uncertainty).
    axes[1, 0].errorbar(x_pos, model.beta_means, yerr=model.beta_std,
                        fmt='o', color='darkblue', alpha=0.7,
                        capsize=5, capthick=2, markersize=8)
    axes[1, 0].axhline(y=0, color='r', linestyle='--', alpha=0.5)
    axes[1, 0].set_xlabel('特征变量', fontsize=12)
    axes[1, 0].set_ylabel('参数值(均值±标准差)', fontsize=12)
    axes[1, 0].set_title('参数估计的不确定性', fontsize=14, fontweight='bold')
    axes[1, 0].set_xticks(x_pos)
    axes[1, 0].set_xticklabels(feature_names, rotation=45, ha='right', fontsize=10)
    # Panel 5: goodness-of-fit metrics (AIC rescaled by 1000 to share the axis).
    metric_names = ['伪R²', '调整伪R²', '预测准确率', 'AIC/1000']
    metric_values = [
        metrics['rho2'],
        metrics['adj_rho2'],
        metrics['accuracy'],
        metrics['aic'] / 1000
    ]
    bars5 = axes[1, 1].bar(metric_names, metric_values,
                           color=['#4C72B0', '#55A868', '#C44E52', '#8172B2'],
                           alpha=0.8, edgecolor='black')
    axes[1, 1].set_ylabel('值', fontsize=12)
    axes[1, 1].set_title('模型拟合指标', fontsize=14, fontweight='bold')
    # Value labels.
    for bar, v in zip(bars5, metric_values):
        height = bar.get_height()
        axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{v:.3f}', ha='center', va='bottom', fontsize=10, fontweight='bold')
    # Panel 6: simulated coefficient distributions for the 3 most important features.
    top_3_idx = sorted_idx[:3]
    # Monte Carlo sample size for the density plots.
    n_sim = 1000
    for i, idx in enumerate(top_3_idx):
        # Draw from the estimated Normal(mean, std) for this coefficient.
        simulated = np.random.normal(model.beta_means[idx], model.beta_std[idx], n_sim)
        # Kernel density estimate of the draw.
        sns.kdeplot(simulated, ax=axes[1, 2], label=feature_names[idx],
                    linewidth=2, alpha=0.7)
    axes[1, 2].axvline(x=0, color='r', linestyle='--', alpha=0.5, linewidth=1)
    axes[1, 2].set_xlabel('参数值', fontsize=12)
    axes[1, 2].set_ylabel('密度', fontsize=12)
    axes[1, 2].set_title('Top 3特征参数分布', fontsize=14, fontweight='bold')
    axes[1, 2].legend(fontsize=10)
    plt.tight_layout()
    plt.savefig('av_accident_random_params_results.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("图表已保存为 'av_accident_random_params_results.png'")
    # Second figure: detailed interpretation charts.
    fig2, axes2 = plt.subplots(1, 2, figsize=(15, 6))
    # Scatter of mean vs std, colored by importance.
    scatter = axes2[0].scatter(model.beta_means, model.beta_std,
                               c=importance, s=150, alpha=0.7,
                               cmap='viridis', edgecolor='black', linewidth=0.5)
    # Label every point with its feature name.
    for i, (x_val, y_val) in enumerate(zip(model.beta_means, model.beta_std)):
        axes2[0].text(x_val, y_val, feature_names[i],
                      fontsize=9, ha='center', va='bottom',
                      bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7))
    axes2[0].axvline(x=0, color='r', linestyle='--', alpha=0.5, linewidth=1)
    axes2[0].set_xlabel('参数均值', fontsize=12)
    axes2[0].set_ylabel('参数标准差', fontsize=12)
    axes2[0].set_title('特征参数分布散点图', fontsize=14, fontweight='bold')
    axes2[0].grid(True, alpha=0.3)
    # Colorbar keyed to importance.
    cbar = plt.colorbar(scatter, ax=axes2[0])
    cbar.set_label('特征重要性(|均值|×标准差)', fontsize=10)
    # Right panel: free-text interpretation.
    axes2[1].axis('off')
    # NOTE(review): explanation_text is built but never rendered -- only the
    # per-feature text boxes below are drawn on the axis.
    explanation_text = """
参数解释:
1. 参数均值:
- 正值:增加严重损伤的概率
- 负值:减少严重损伤的概率
2. 参数标准差:
- 衡量个体间异质性
- 值越大:特征影响在不同事故间差异越大
- 值越小:特征影响较为一致
3. 特征重要性:
- |均值| × 标准差
- 综合考虑了效应大小和异质性
重要发现:
"""
    # One text box per top-3 feature.
    for i, idx in enumerate(top_3_idx[:3]):
        feature = feature_names[idx]
        mean_val = model.beta_means[idx]
        std_val = model.beta_std[idx]
        if mean_val > 0:
            effect = "增加严重损伤概率"
        else:
            effect = "减少严重损伤概率"
        axes2[1].text(0.05, 0.85 - i*0.15,
                      f"{i+1}. {feature}:\n"
                      f" 均值={mean_val:.3f}, 标准差={std_val:.3f}\n"
                      f" → {effect}",
                      fontsize=10,
                      bbox=dict(boxstyle='round,pad=0.5', facecolor='lightyellow', alpha=0.8),
                      verticalalignment='top')
    plt.tight_layout()
    plt.savefig('av_accident_parameter_interpretation.png', dpi=300, bbox_inches='tight')
    plt.show()
    print("参数解释图表已保存为 'av_accident_parameter_interpretation.png'")
def interpret_results(model, feature_names, coding_info):
    """Print a ranked interpretation of the fitted random parameters,
    keyword-matched policy suggestions, and a heterogeneity breakdown.

    Reads only model.beta_means / model.beta_std; console output only,
    returns nothing.
    """
    print("\n" + "="*70)
    print("模型结果解释")
    print("="*70)
    # Importance heuristic: |mean effect| scaled by its heterogeneity.
    importance = np.abs(model.beta_means) * model.beta_std
    sorted_idx = np.argsort(importance)[::-1]
    print("\n最重要的特征(按重要性排序):")
    print("-"*60)
    for rank, idx in enumerate(sorted_idx):
        feature = feature_names[idx]
        mean_val = model.beta_means[idx]
        std_val = model.beta_std[idx]
        imp_val = importance[idx]
        # Sign of the mean decides the direction of the effect.
        if mean_val > 0:
            effect = "增加严重损伤的概率"
            sign = "+"
        else:
            effect = "减少严重损伤的概率"
            sign = "-"
        # Heterogeneity bands at std-dev thresholds 0.2 and 0.5.
        if std_val > 0.5:
            heterogeneity = "(个体间影响差异较大)"
        elif std_val > 0.2:
            heterogeneity = "(个体间影响有一定差异)"
        else:
            heterogeneity = "(个体间影响较为一致)"
        print(f"\n{rank+1}. {feature}:")
        print(f" 参数均值: {mean_val:.4f} ({sign})")
        print(f" 参数标准差: {std_val:.4f} {heterogeneity}")
        print(f" 重要性: {imp_val:.4f}")
        print(f" 解释: {effect}")
        # Show the code-book entry for this feature when available.
        if feature in coding_info:
            print(f" 编码说明:")
            for code, desc in coding_info[feature].items():
                print(f" {code}: {desc}")
    print("\n" + "-"*60)
    print("政策建议:")
    # Keyword-matched suggestions for the three most important features.
    top_features = [feature_names[i] for i in sorted_idx[:3]]
    for i, feature in enumerate(top_features):
        if 'collision' in feature.lower() or '碰撞' in feature:
            print(f"{i+1}. 碰撞类型:针对常见碰撞类型优化AV的避障算法")
        elif 'Rush' in feature or '高峰' in feature:
            print(f"{i+1}. 高峰时段:在高峰时段加强AV监控和调度")
        elif 'Weather' in feature or '天气' in feature:
            print(f"{i+1}. 天气条件:恶劣天气下可能需要降低AV行驶速度或调整路线")
        elif 'Light' in feature or '光照' in feature:
            print(f"{i+1}. 光照条件:夜间或低光照条件下需要改进AV的感知系统")
        elif 'Surface' in feature or '路面' in feature:
            print(f"{i+1}. 路面状况:潮湿或结冰路面需要调整AV的制动策略")
        elif 'Season' in feature or '季节' in feature:
            print(f"{i+1}. 季节因素:根据不同季节调整AV的行驶策略")
        else:
            print(f"{i+1}. {feature}:需要进一步分析该因素的影响机制")
    print("\n异质性分析:")
    # Counts per heterogeneity band (same 0.2 / 0.5 thresholds as above).
    high_hetero = np.sum(model.beta_std > 0.5)
    med_hetero = np.sum((model.beta_std > 0.2) & (model.beta_std <= 0.5))
    print(f" - 高异质性特征(标准差>0.5): {high_hetero}个")
    print(f" - 中等异质性特征(0.2<标准差≤0.5): {med_hetero}个")
    print(f" - 低异质性特征(标准差≤0.2): {len(model.beta_std) - high_hetero - med_hetero}个")
    if high_hetero > 0:
        print(f"\n建议对以下高异质性特征进行进一步细分分析:")
        for idx in np.where(model.beta_std > 0.5)[0]:
            print(f" - {feature_names[idx]} (标准差: {model.beta_std[idx]:.3f})")
def save_results(model, metrics, feature_names, choice_categories, coding_info, y_actual):
    """Export parameter estimates, fit metrics, the code book and predicted
    probabilities to a multi-sheet Excel workbook; return the file name.

    y_actual must hold the target in its ORIGINAL coding and have one entry
    per row of model.final_probs.
    """
    print("\n" + "="*70)
    print("保存结果到Excel")
    print("="*70)
    # Approximate t statistics, computed once and reused for both the table
    # column and the significance stars (previously evaluated twice).
    # NOTE(review): beta_std/sqrt(n_draws) is not a real standard error of
    # the estimated mean (n_draws is the simulation count, not the sample
    # size) -- the stars are heuristic, mirroring model.summary().
    t_stats = model.beta_means / (model.beta_std / np.sqrt(model.n_draws))
    results_df = pd.DataFrame({
        '特征': feature_names,
        '参数均值': model.beta_means,
        '参数标准差': model.beta_std,
        't统计量(近似)': t_stats,
        '重要性(|均值|×标准差)': np.abs(model.beta_means) * model.beta_std,
        '显著性': ['***' if abs(t) > 2.58 else '**' if abs(t) > 1.96 else '*' if abs(t) > 1.645 else ''
                   for t in t_stats]
    })
    # Most influential features first.
    results_df = results_df.sort_values('重要性(|均值|×标准差)', ascending=False)
    # Model-evaluation sheet.
    metrics_df = pd.DataFrame({
        '指标': ['观测数', '选择类别数', '特征数', '蒙特卡罗抽样次数',
                 '负对数似然', '对数似然', '伪R² (McFadden)', '调整伪R²',
                 'AIC', 'BIC', '预测准确率', '参数数量'],
        '值': [metrics.get('n_obs', 'N/A'), len(choice_categories), len(feature_names),
               model.n_draws, model.result.fun, -model.result.fun,
               metrics['rho2'], metrics['adj_rho2'], metrics['aic'],
               metrics['bic'], metrics['accuracy'], metrics['n_params']]
    })
    # Code-book sheet: only variables used by the model, plus the target.
    coding_dfs = []
    for var, codes in coding_info.items():
        if var in feature_names or 'injured' in var.lower():
            temp_df = pd.DataFrame({
                '变量': [var] * len(codes),
                '编码': list(codes.keys()),
                '含义': list(codes.values())
            })
            coding_dfs.append(temp_df)
    coding_df = pd.concat(coding_dfs, ignore_index=True) if coding_dfs else pd.DataFrame()
    # Prediction sheet (only available once fit() has stored final_probs).
    if hasattr(model, 'final_probs'):
        prob_df = pd.DataFrame(model.final_probs,
                               columns=[f'P(损伤程度={c})' for c in choice_categories])
        prob_df['实际损伤程度'] = y_actual
        # Fix: np.argmax yields a 0-based column index; map it back to the
        # original category code before comparing with y_actual so the
        # '预测正确' flag is valid even when categories are not 0..k-1.
        pred_idx = np.argmax(model.final_probs, axis=1)
        prob_df['预测损伤程度'] = np.asarray(choice_categories)[pred_idx]
        prob_df['预测正确'] = prob_df['预测损伤程度'] == prob_df['实际损伤程度']
    output_file = 'av_accident_random_params_analysis.xlsx'
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        results_df.to_excel(writer, sheet_name='参数估计', index=False)
        metrics_df.to_excel(writer, sheet_name='模型评估', index=False)
        if not coding_df.empty:
            coding_df.to_excel(writer, sheet_name='编码说明', index=False)
        if hasattr(model, 'final_probs'):
            prob_df.to_excel(writer, sheet_name='预测概率', index=False)
    print(f"结果已保存到: {output_file}")
    return output_file
# ==================== 6. 主程序 ====================
def main():
    """Pipeline entry point: load data, fit the random-parameters logit,
    interpret, visualize and export the results."""
    print("="*70)
    print("AV事故损伤程度随机参数Logit模型分析")
    print("基于实际数据")
    print("="*70)
    # [1] Load the real workbook, falling back to sample data when missing.
    print("\n[1] 加载数据...")
    data_file = '编码数据.xlsx'  # change to your actual file name
    if os.path.exists(data_file):
        df, coding_info = load_actual_data(data_file)
    else:
        print(f"文件 {data_file} 不存在,使用示例数据")
        df = load_sample_data()
        coding_info = get_sample_coding_info()
    # [2] Print the coding scheme.
    print("\n[2] 解释编码方案...")
    interpret_coding_scheme(coding_info)
    # [3] Prepare model inputs.
    print("\n[3] 准备模型数据...")
    # Candidate features -- adjust to your data.
    feature_cols = [
        'Typr of collision',
        'Season',
        'Weekdays Mon-Fri',
        'Rush Hour or not',
        'Weather',
        'Light condition',
        'Roadway Surface',
        'Roadway Conditions'
    ]
    # Keep only the features actually present in the loaded data.
    available_features = [col for col in feature_cols if col in df.columns]
    if len(available_features) < len(feature_cols):
        print(f"警告:部分特征在数据中不存在,使用可用特征:{available_features}")
    feature_cols = available_features
    X, y, choice_categories, feature_names, scaler, y_mapping = prepare_data_for_model(
        df,
        target_col='Highest injured severity',
        feature_cols=feature_cols
    )
    # [4] Fit the random-parameters logit.
    print("\n[4] 运行随机参数Logit模型...")
    # Scale the number of Monte Carlo draws with the sample size.
    if X.shape[0] < 50:
        n_draws = 50
    elif X.shape[0] < 200:
        n_draws = 100
    else:
        n_draws = 200
    print(f"使用 {n_draws} 次蒙特卡罗抽样")
    model, metrics = run_random_parameters_model(
        X, y, feature_names, choice_categories, n_draws=n_draws
    )
    # [5] Interpret the estimates.
    print("\n[5] 结果解释...")
    interpret_results(model, feature_names, coding_info)
    # [6] Visualize.
    print("\n[6] 生成可视化图表...")
    create_visualizations(model, metrics, feature_names, choice_categories, coding_info)
    # [7] Export to Excel.
    print("\n[7] 保存结果...")
    # Record the observation count for the metrics sheet.
    metrics['n_obs'] = X.shape[0]
    # Actual target values in their original coding.
    # NOTE(review): taken from the FULL df -- if prepare_data_for_model
    # dropped rows with missing values, this is longer than the prediction
    # table and save_results would fail; verify when missing data occur.
    y_actual_original = df['Highest injured severity'].values
    output_file = save_results(model, metrics, feature_names, choice_categories, coding_info, y_actual_original)
    # [8] Wrap-up summary.
    print("\n" + "="*70)
    print("分析完成!")
    print("="*70)
    print(f"\n输出文件:")
    print(f" 1. {output_file} - Excel格式的详细结果")
    print(f" 2. av_accident_random_params_results.png - 主要可视化图表")
    print(f" 3. av_accident_parameter_interpretation.png - 参数解释图表")
    print("\n后续步骤建议:")
    print(" 1. 检查模型收敛性,可能需要增加蒙特卡罗抽样次数")
    print(" 2. 尝试不同的特征组合和变量转换")
    print(" 3. 考虑加入交互项或非线性效应")
    print(" 4. 对高异质性特征进行分组分析")
    print(" 5. 验证模型在独立样本上的表现")
if __name__ == "__main__":
    # Fix: stray pasted blog text after the main() call (and on the next
    # line) made the file a syntax error; removed.
    main()