imp output shows garbled characters (?? or 口口 boxes)

Check that the character sets of all three parties match: the dmp file, the client, and the server.

1. The dmp file: run cat aiki.dmp | od -x | head -1 | awk '{print $2 $3}' | cut -c 3-6. If the result is, say, 0345, note that od -x prints the bytes swapped, so the last two digits must be reversed: read it as 0354. Then look up the character set name with select nls_charset_name(to_number('0354','xxxx')) from dual;

SQL> select nls_charset_name(to_number('0354','xxxx')) from dual;

NLS_CHARSET_NAME(TO_NUMBER('0354','XXXX'
----------------------------------------
ZHS16GBK
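
To avoid reading and swapping the digits by hand, the whole check can be scripted. This is a minimal sketch, assuming the same od output layout as above and a local SYSDBA login (the connect string is an assumption; adjust it for your environment):

id=$(cat aiki.dmp | od -x | head -1 | awk '{print $2 $3}' | cut -c 3-6)   # e.g. 0345
swapped="${id:0:2}${id:3:1}${id:2:1}"                                    # swap last two digits: 0345 -> 0354
echo "select nls_charset_name(to_number('$swapped','xxxx')) from dual;" | sqlplus -s / as sysdba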

2. Client character set (Linux): check with echo $NLS_LANG and set it to the same value as the server. It can be made permanent in .bash_profile (export NLS_LANG=AMERICAN_AMERICA.ZHS16GBK).
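
A minimal sketch of persisting and verifying the setting (the value is assumed to be the server character set found in step 3):

echo 'export NLS_LANG=AMERICAN_AMERICA.ZHS16GBK' >> ~/.bash_profile
source ~/.bash_profile
echo $NLS_LANG    # expect: AMERICAN_AMERICA.ZHS16GBK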

3. Server character set: SQL> select name,value$ from props$ where name='NLS_CHARACTERSET';

NAME
------------------------------
VALUE$
--------------------------------------------------------------------------------
NLS_CHARACTERSET
ZHS16GBK
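
Querying props$ directly requires SYS access; as an alternative sketch, the same value is exposed through the documented nls_database_parameters view, which an ordinary session can usually read:

SQL> select value from nls_database_parameters where parameter='NLS_CHARACTERSET';

VALUE
----------------------------------------
ZHS16GBK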


4. Once all three character sets match, the imported data displays correctly.


Note: question marks (??) can be fixed with the steps above. If instead you see boxes (口口) during the check, the character set of the connection software you are using is not configured correctly; adjust it there.

For example, the problem appeared when connecting with PuTTY (set to UTF-8); switching to SecureCRT, which offers more character set options, resolved it.
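
A quick way to confirm that the terminal side is the culprit: emit some known GBK bytes and see whether they render. A minimal sketch, assuming the local shell uses UTF-8 and iconv is available (the sample text is arbitrary):

echo 汉字测试 | iconv -f UTF-8 -t GBK
# The four characters render only if the terminal's remote charset is GBK;
# boxes or garbage here mean the terminal, not Oracle, is misconfigured.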

