import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectFromModel
import xgboost as xgb
import lightgbm as lgb
import warnings
import copy
from scipy.stats import pearsonr
from collections import Counter

warnings.filterwarnings('ignore')

# Configure a CJK-capable font so Chinese category labels in the data render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

print("=== Enterprise Default Prediction Analysis (Optimized) ===")

# 1. Data loading and initial exploration
print("\n1. Loading data...")
df_train = pd.read_csv('./train.csv', low_memory=False)
df_test = pd.read_csv('./test.csv', low_memory=False)
data_dict = pd.read_csv('./数据字典.csv', encoding='utf-8')  # data dictionary

print(f"Train set shape: {df_train.shape}")
print(f"Test set shape: {df_test.shape}")

# 2. Data quality checks
print("\n2. Data quality checks...")

# Missing values
missing_train = df_train.isnull().sum()
missing_test = df_test.isnull().sum()
print("Missing values in train set:")
print(missing_train[missing_train > 0].sort_values(ascending=False).head(10))
print("\nMissing values in test set:")
print(missing_test[missing_test > 0].sort_values(ascending=False).head(10))

# Target distribution
target_dist = df_train['target'].value_counts()
print(f"\nTarget distribution:\n{target_dist}")
print(f"Default rate: {df_train['target'].mean():.4f}")

# 3. Data cleaning
print("\n3. Data cleaning...")

# Categorical variables
categorical_columns = ['province', 'city', 'industry_l1_name', 'industry_l2_name',
                       'industry_l3_name', 'industry_l4_name', 'honor_titles',
                       'sci_tech_ent_tags', 'top100_tags', 'company_on_scale']

# Label-encode categoricals; fit on the union of train and test so the mapping is consistent
label_encoders = {}
for col in categorical_columns:
    if col in df_train.columns:
        le = LabelEncoder()
        combined = pd.concat([df_train[col].fillna('missing'), df_test[col].fillna('missing')])
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].fillna('missing'))
        df_test[col] = le.transform(df_test[col].fillna('missing'))
        label_encoders[col] = le

# Impute numeric missing values with the train-set median; reusing the train
# median for the test set keeps test statistics out of the preprocessing
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col not in ['company_id', 'target']]
for col in numeric_columns:
    median_val = df_train[col].median()
    df_train[col] = df_train[col].fillna(median_val)
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(median_val)
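# Optional sanity check (a minimal sketch, not part of the original pipeline):
# because each encoder was fit on the union of train and test, the stored
# encoders can map the integer codes back to the raw category labels, which is
# handy when reading feature importances later.
if 'province' in label_encoders:
    _codes = df_train['province'].head(3)
    print("Example decode:", label_encoders['province'].inverse_transform(_codes))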
# 4. Feature engineering
print("\n4. Feature engineering...")

def safe_divide(a, b):
    """Element-wise division that maps non-finite results (divide-by-zero, 0/0) to 0."""
    with np.errstate(divide='ignore', invalid='ignore'):
        result = np.divide(a, b)
        result[~np.isfinite(result)] = 0  # replace inf and NaN with 0
    return result

# Financial health ratios (guarded consistently on both frames)
if all(col in df_train.columns for col in ['total_assets', 'total_liabilities', 'total_equity']):
    df_train['asset_liability_ratio'] = safe_divide(df_train['total_liabilities'], df_train['total_assets'] + 1)
    df_train['equity_ratio'] = safe_divide(df_train['total_equity'], df_train['total_assets'] + 1)
    # debt_to_equity is referenced later in the EDA, so build it here as well
    df_train['debt_to_equity'] = safe_divide(df_train['total_liabilities'], df_train['total_equity'] + 1)
if all(col in df_test.columns for col in ['total_assets', 'total_liabilities', 'total_equity']):
    df_test['asset_liability_ratio'] = safe_divide(df_test['total_liabilities'], df_test['total_assets'] + 1)
    df_test['equity_ratio'] = safe_divide(df_test['total_equity'], df_test['total_assets'] + 1)
    df_test['debt_to_equity'] = safe_divide(df_test['total_liabilities'], df_test['total_equity'] + 1)

# Profitability
if all(col in df_train.columns for col in ['net_profit', 'total_sales']):
    df_train['profit_margin'] = safe_divide(df_train['net_profit'], df_train['total_sales'] + 1)
if all(col in df_test.columns for col in ['net_profit', 'total_sales']):
    df_test['profit_margin'] = safe_divide(df_test['net_profit'], df_test['total_sales'] + 1)

# Change-frequency features (12-month registration/business change counts)
change_features = [col for col in df_train.columns if 'chg_cnt' in col]
if change_features:
    df_train['total_change_12m'] = df_train[[col for col in change_features if '12m' in col]].sum(axis=1)
    # Mirror the feature on the test set
    test_change_features = [col for col in df_test.columns if 'chg_cnt' in col and '12m' in col]
    if test_change_features:
        df_test['total_change_12m'] = df_test[test_change_features].sum(axis=1)
    else:
        df_test['total_change_12m'] = 0

# Risk features (abnormal-operation, penalty, and dishonesty counts)
risk_features = [col for col in df_train.columns
                 if any(x in col for x in ['abn_cnt', 'pun_cnt', 'dishonest'])]
if risk_features:
    df_train['total_risk_12m'] = df_train[[col for col in risk_features if '12m' in col]].sum(axis=1)
    # Mirror the feature on the test set
    test_risk_features = [col for col in df_test.columns
                          if any(x in col for x in ['abn_cnt', 'pun_cnt', 'dishonest']) and '12m' in col]
    if test_risk_features:
        df_test['total_risk_12m'] = df_test[test_risk_features].sum(axis=1)
    else:
        df_test['total_risk_12m'] = 0
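# A quick illustration (toy values, not from the dataset) of why safe_divide
# is used for the ratio features: plain division propagates inf/NaN, which
# would later break StandardScaler and distort the tree models.
_liab = pd.Series([50.0, 0.0, 30.0])
_assets = pd.Series([100.0, 0.0, 0.0])
print(safe_divide(_liab, _assets).tolist())  # [0.5, 0.0, 0.0] — 0/0 and 30/0 both map to 0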
# 5. Exploratory data analysis
print("\n5. Exploratory data analysis...")
plt.figure(figsize=(15, 10))

plt.subplot(2, 3, 1)
df_train['target'].value_counts().plot.pie(autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
plt.title('Target distribution')

# Key financial indicators
key_financial_metrics = ['total_assets', 'total_liabilities', 'total_equity',
                         'total_sales', 'total_profit', 'net_profit']
plt.subplot(2, 3, 2)
for metric in key_financial_metrics[:3]:
    if metric in df_train.columns:
        # Signed log transform tames extreme values while preserving sign
        safe_data = np.log1p(np.abs(df_train[metric])) * np.sign(df_train[metric])
        sns.kdeplot(safe_data, label=metric)
plt.title('Key financial indicator distributions')
plt.legend()

plt.subplot(2, 3, 3)
# Leverage vs. default
if 'debt_to_equity' in df_train.columns:
    sns.boxplot(x='target', y='debt_to_equity', data=df_train)
    plt.title('Debt-to-equity vs. default')

# Time-window features
time_features = [col for col in df_train.columns if any(x in col for x in ['12m', '36m', '60m'])]
plt.subplot(2, 3, 4)
if len(time_features) > 0:
    feature = time_features[0]  # show the first time-window feature
    df_train.groupby('target')[feature].mean().plot(kind='bar')
    plt.title(f'{feature} vs. default')

# Regional distribution
plt.subplot(2, 3, 5)
if 'province' in df_train.columns:
    province_default = df_train.groupby('province')['target'].mean().sort_values(ascending=False).head(10)
    province_default.plot(kind='bar')
    plt.title('Top 10 provinces by default rate')

# Industry distribution
plt.subplot(2, 3, 6)
if 'industry_l1_name' in df_train.columns:
    industry_default = df_train.groupby('industry_l1_name')['target'].mean().sort_values(ascending=False).head(10)
    industry_default.plot(kind='bar')
    plt.title('Top 10 industries by default rate')

plt.tight_layout()
plt.show()

# 6. Improved fake-indicator detection
print("\n6. Improved fake-indicator detection...")

def detect_fake_features_optimized(df, target_col='target', min_fake=2, max_fake=5):
    """Detect likely fake (uninformative) indicators via a weighted score that
    combines random forest importance, XGBoost importance, correlation with
    the target, and a low-variance penalty."""
    # Method 1: random forest feature importance
    X_temp = df.drop(['company_id', target_col], axis=1, errors='ignore')
    y_temp = df[target_col]
    numeric_features = X_temp.select_dtypes(include=[np.number]).columns
    X_temp = X_temp[numeric_features]
    X_temp = X_temp.replace([np.inf, -np.inf], np.nan).fillna(X_temp.median())

    rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    rf.fit(X_temp, y_temp)
    # reset_index so the row position reflects the importance rank
    feature_importance = pd.DataFrame({
        'feature': numeric_features,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False).reset_index(drop=True)

    # Method 2: XGBoost feature importance
    xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, max_depth=6)
    xgb_model.fit(X_temp, y_temp)
    xgb_importance = pd.DataFrame({
        'feature': numeric_features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False).reset_index(drop=True)

    # Method 3: correlation with the target
    feature_corr = []
    for col in numeric_features:
        try:
            clean_data = df[col].replace([np.inf, -np.inf], np.nan).fillna(0)
            if len(clean_data.unique()) > 1:
                corr, p_value = pearsonr(clean_data, df[target_col])
                if not np.isnan(corr):
                    feature_corr.append((col, abs(corr), corr, p_value))
        except Exception:
            continue
    feature_corr.sort(key=lambda x: x[1], reverse=True)

    # Method 4: (near-)zero variance
    low_variance_features = [col for col in numeric_features if df[col].std() < 0.001]

    # Combine the four signals into a weighted score per feature
    feature_scores = {}
    for feature in numeric_features:
        score = 0
        # Random forest importance rank (weight 0.4)
        rf_rank = feature_importance[feature_importance['feature'] == feature].index
        if len(rf_rank) > 0:
            score += (1 - rf_rank[0] / len(feature_importance)) * 0.4
        # XGBoost importance rank (weight 0.3)
        xgb_rank = xgb_importance[xgb_importance['feature'] == feature].index
        if len(xgb_rank) > 0:
            score += (1 - xgb_rank[0] / len(xgb_importance)) * 0.3
        # Absolute correlation (weight 0.2)
        corr_info = next((item for item in feature_corr if item[0] == feature), None)
        if corr_info:
            score += corr_info[1] * 0.2
        # Low-variance penalty (weight 0.1)
        if feature in low_variance_features:
            score -= 0.1
        feature_scores[feature] = score

    # Lowest-scoring features are the fake-indicator candidates
    sorted_features = sorted(feature_scores.items(), key=lambda x: x[1])

    # Exclude features that are clearly important on business grounds
    important_keywords = ['lhc_cnt', 'court', 'justice', 'dishonest', 'pun_cnt', 'abn_cnt',
                          'total_assets', 'total_liabilities', 'total_equity', 'net_profit',
                          'legal_person', 'bus_chg', 'establish_year']
    fake_candidates = []
    for feature, score in sorted_features:
        if len(fake_candidates) >= max_fake:
            break
        # Flag only low-scoring features that match no business-critical keyword
        if not any(keyword in feature for keyword in important_keywords) and score < 0.1:
            fake_candidates.append(feature)

    # Backfill from the lowest scores if too few candidates were found
    if len(fake_candidates) < min_fake:
        for feature, score in sorted_features:
            if feature not in fake_candidates and len(fake_candidates) < max_fake:
                fake_candidates.append(feature)

    return fake_candidates[:max_fake]

# Run the optimized detector
potential_fake_features = detect_fake_features_optimized(df_train)
print(f"Suspected fake indicators ({len(potential_fake_features)}): {potential_fake_features}")

# 7. Improved model building
print("\n7. Improved model building...")

# Numeric features shared by both sets, minus identifiers and the target
common_features = list(set(df_train.columns) & set(df_test.columns))
common_features = [col for col in common_features if col not in ['company_id', 'target']]
common_features = [col for col in common_features if df_train[col].dtype in [np.int64, np.float64]]

# Drop the suspected fake indicators
features_to_use = [col for col in common_features if col not in potential_fake_features]
print(f"Number of features used: {len(features_to_use)}")

X = df_train[features_to_use]
y = df_train['target']
X_test = df_test[features_to_use]

# Replace infinities, then impute with the train-set median
X = X.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median())
X_test = X_test.fillna(X.median())

# Feature scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2,
                                                  random_state=42, stratify=y)
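# As a cross-check on the hand-rolled detector above, scikit-learn's
# SelectFromModel (imported earlier but otherwise unused) can flag
# low-importance features directly from a fitted forest. A minimal sketch,
# not part of the original pipeline; the 'median' threshold is an
# illustrative choice.
_selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    threshold='median'  # keep only features above the median importance
).fit(X_train, y_train)
_dropped = [f for f, keep in zip(features_to_use, _selector.get_support()) if not keep]
print(f"SelectFromModel would drop {len(_dropped)} of {len(features_to_use)} features")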
# Optimized scoring function with strict control of the predicted default rate
def custom_score_optimized(y_true, y_pred_proba_val):
    """Score strictly per the rule 0.5*AUC + 0.3*Recall + 0.2*Precision,
    searching for the threshold that maximizes it within a default-rate band."""
    auc_score = roc_auc_score(y_true, y_pred_proba_val)
    best_score_val = 0
    best_threshold_val = 0.5
    actual_default_rate_val = y_true.mean()

    # Constrain the predicted default rate to a band around the actual rate
    min_default_rate = max(0.03, actual_default_rate_val * 0.7)  # raised lower bound
    max_default_rate = min(0.10, actual_default_rate_val * 1.3)  # lowered upper bound

    # Fine-grained threshold search
    for threshold_val in np.arange(0.01, 0.99, 0.002):
        y_pred_val = (y_pred_proba_val > threshold_val).astype(int)
        pred_default_rate_val = y_pred_val.mean()
        # Enforce the default-rate band
        if pred_default_rate_val < min_default_rate or pred_default_rate_val > max_default_rate:
            continue
        precision_val = precision_score(y_true, y_pred_val, zero_division=0)
        recall_val = recall_score(y_true, y_pred_val, zero_division=0)
        # Strictly per the scoring rule
        score_val = 0.5 * auc_score + 0.3 * recall_val + 0.2 * precision_val
        if score_val > best_score_val:
            best_score_val = score_val
            best_threshold_val = threshold_val

    # Fallback: optimize F1 under the same default-rate band
    if best_score_val == 0:
        print("  Warning: no threshold satisfied the default-rate band; falling back to constrained F1")
        for threshold_val in np.arange(0.01, 0.99, 0.01):
            y_pred_val = (y_pred_proba_val > threshold_val).astype(int)
            pred_default_rate_val = y_pred_val.mean()
            # Still enforce the default-rate band
            if pred_default_rate_val < min_default_rate or pred_default_rate_val > max_default_rate:
                continue
            f1_val = f1_score(y_true, y_pred_val, zero_division=0)
            if f1_val > best_score_val:
                best_score_val = f1_val
                best_threshold_val = threshold_val

    return best_score_val, best_threshold_val

# Further tuned model parameters
models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, class_weight='balanced',
        C=0.8,  # lower C for stronger regularization
        max_iter=2000, solver='liblinear'
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=250, random_state=42,
        max_depth=12,  # more trees, shallower depth
        class_weight='balanced',
        min_samples_split=25, min_samples_leaf=15,  # stricter split requirements
        max_features=0.6,  # more conservative feature sampling
        bootstrap=True, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=250, random_state=42,
        max_depth=5,  # more trees, shallower depth
        learning_rate=0.08, subsample=0.75,  # lower learning rate and subsampling
        min_samples_split=25, min_samples_leaf=15,  # stricter split requirements
        max_features=0.7
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=250, random_state=42,
        max_depth=5,  # more trees, shallower depth
        learning_rate=0.08, subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.15, reg_lambda=0.15, gamma=0.1,  # stronger regularization
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
        eval_metric='logloss', n_jobs=-1
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=300, random_state=42,
        max_depth=7, learning_rate=0.06,
        subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.2, reg_lambda=0.2,
        min_child_samples=60,  # stronger regularization and larger leaf minimum
        num_leaves=35, boosting_type='gbdt', n_jobs=-1,
        class_weight='balanced', min_split_gain=0.02,
        subsample_freq=1, verbose=-1
    )
}

print("\nModel comparison:")
best_model = None
best_score_main = 0
best_threshold_main = 0.5
model_performance = {}

for name, model in models.items():
    try:
        print(f"Training {name}...")
        model.fit(X_train, y_train)
        y_pred_proba_main = model.predict_proba(X_val)[:, 1]

        score_main, threshold_main = custom_score_optimized(y_val, y_pred_proba_main)
        auc_main = roc_auc_score(y_val, y_pred_proba_main)
        y_pred_main = (y_pred_proba_main > threshold_main).astype(int)
        pred_default_rate_main = y_pred_main.mean()
        precision_main = precision_score(y_val, y_pred_main, zero_division=0)
        recall_main = recall_score(y_val, y_pred_main, zero_division=0)
        f1_main = f1_score(y_val, y_pred_main, zero_division=0)

        print(f"{name:20} AUC: {auc_main:.4f}, custom score: {score_main:.4f}, threshold: {threshold_main:.3f}")
        print(f"{'':20} Precision: {precision_main:.4f}, Recall: {recall_main:.4f}, "
              f"F1: {f1_main:.4f}, predicted default rate: {pred_default_rate_main:.4f}")

        model_performance[name] = {
            'model': model, 'score': score_main, 'auc': auc_main,
            'threshold': threshold_main, 'precision': precision_main,
            'recall': recall_main, 'f1': f1_main
        }

        if score_main > best_score_main:
            best_score_main = score_main
            best_model = model
            best_threshold_main = threshold_main
    except Exception as e:
        print(f"{name} training failed: {str(e)}")

print(f"\nBest model: {type(best_model).__name__}, custom score: {best_score_main:.4f}, "
      f"best threshold: {best_threshold_main:.3f}")
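# A tiny synthetic demonstration (illustrative only, not from the dataset) of
# how the scorer behaves: with a ~5% positive rate, the returned threshold
# must keep the predicted default rate inside the band computed above.
_rng = np.random.RandomState(0)
_y_demo = pd.Series((_rng.rand(1000) < 0.05).astype(int))
_p_demo = np.clip(0.05 + 0.4 * _y_demo + 0.1 * _rng.randn(1000), 0, 1)
_s, _t = custom_score_optimized(_y_demo, _p_demo)
print(f"Demo score: {_s:.4f}, demo threshold: {_t:.3f}, "
      f"demo default rate: {(_p_demo > _t).mean():.4f}")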
# 8. Improved model ensembling
print("\n8. Improved model ensembling...")
if len(model_performance) >= 2:
    # Ensemble the top-scoring models
    top_models = sorted(model_performance.items(), key=lambda x: x[1]['score'], reverse=True)[:3]
    print(f"Ensemble candidates: {[name for name, _ in top_models]}")

    # Soft-voting ensemble
    estimators_list = [(name, perf['model']) for name, perf in top_models]
    voting_clf = VotingClassifier(estimators=estimators_list, voting='soft')
    voting_clf.fit(X_train, y_train)

    # Evaluate the ensemble
    y_pred_proba_ensemble = voting_clf.predict_proba(X_val)[:, 1]
    ensemble_score, ensemble_threshold = custom_score_optimized(y_val, y_pred_proba_ensemble)
    ensemble_auc = roc_auc_score(y_val, y_pred_proba_ensemble)
    print(f"Ensemble performance: AUC: {ensemble_auc:.4f}, custom score: {ensemble_score:.4f}")

    # Adopt the ensemble only if it is clearly better
    improvement_threshold = 0.003
    if ensemble_score > best_score_main + improvement_threshold:
        best_model = voting_clf
        best_threshold_main = ensemble_threshold
        print(f"Using the ensemble as the final model, improvement: {ensemble_score - best_score_main:.4f}")
    else:
        print("Using the single best model as the final model")
else:
    print("Not enough models; skipping ensembling")

# 9. Model interpretation
print("\n9. Model interpretation...")
if hasattr(best_model, 'feature_importances_'):
    # Tree-model feature importances
    importance_df = pd.DataFrame({
        'feature': features_to_use,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance_df.head(15), y='feature', x='importance')
    plt.title('Top 15 features by importance')
    plt.tight_layout()
    plt.show()

    print("Top 10 features by importance:")
    print(importance_df.head(10))

# 10. Robust test-set prediction with strict default-rate control
print("\n10. Robust test-set prediction...")

# Refit the best model on the full training set
final_model = copy.deepcopy(best_model)
final_model.fit(X_scaled, y)

# Predict test-set probabilities
test_predictions_proba = final_model.predict_proba(X_test_scaled)[:, 1]

# Apply the best threshold
test_predictions = (test_predictions_proba > best_threshold_main).astype(int)

# Check the predicted default rate
pred_default_rate = test_predictions.mean()
actual_default_rate = y.mean()
print(f"\nPredicted default-rate check:")
print(f"Actual train-set default rate: {actual_default_rate:.4f}")
print(f"Predicted test-set default rate: {pred_default_rate:.4f}")

# Tighter default-rate band for the test predictions
target_min_rate = actual_default_rate * 0.8    # raised lower bound
target_max_rate = actual_default_rate * 1.25   # lowered upper bound

if pred_default_rate < target_min_rate or pred_default_rate > target_max_rate:
    print("Adjusting the predicted default rate...")
    # Conservative quantile-based adjustment toward the actual rate
    target_rate = min(max(actual_default_rate * 1.1, target_min_rate), target_max_rate)
    adjusted_threshold = np.percentile(test_predictions_proba, 100 * (1 - target_rate))
    # Blend the original best threshold with the quantile threshold
    balanced_threshold = best_threshold_main * 0.6 + adjusted_threshold * 0.4
    test_predictions = (test_predictions_proba > balanced_threshold).astype(int)
    best_threshold_main = balanced_threshold
    pred_default_rate = test_predictions.mean()
    print(f"Blended threshold: {best_threshold_main:.3f}, adjusted predicted default rate: {pred_default_rate:.4f}")

print(f"Target default-rate band: [{target_min_rate:.4f}, {target_max_rate:.4f}]")

# 11. Write the submission file
print("\n11. Writing the submission file...")
submit_df = pd.DataFrame({
    'uuid': df_test['company_id'],
    'proba': test_predictions_proba,
    'prediction': test_predictions
})
submit_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

# Prediction summary
print(f"\nPrediction summary:")
print(f"Companies predicted to default: {test_predictions.sum()}")
print(f"Predicted default rate: {test_predictions.mean():.4f}")
print(f"Actual train-set default rate: {df_train['target'].mean():.4f}")
print(f"Probability range: [{test_predictions_proba.min():.4f}, {test_predictions_proba.max():.4f}]")
print(f"Threshold used: {best_threshold_main:.3f}")
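# The quantile adjustment above picks the threshold whose exceedance rate
# equals the target default rate. A toy check (synthetic numbers, illustrative
# only): with a 5% target, the 95th percentile of the scores is the threshold
# that flags ~5% of companies.
_probs = np.random.RandomState(1).rand(10_000)
_target_rate = 0.05  # illustrative target, not from the pipeline
_thr = np.percentile(_probs, 100 * (1 - _target_rate))
print(f"Toy threshold: {_thr:.3f}, flagged rate: {(_probs > _thr).mean():.4f}")  # ~0.05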
# 12. Final model evaluation
print("\n12. Final model evaluation...")

# Cross-validation (cross_val_score clones and refits the model per fold)
cv_scores = cross_val_score(final_model, X_scaled, y, cv=5, scoring='roc_auc')
print(f"5-fold CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Final score, strictly per the scoring rule. Note that final_model was refit
# on the full training set (which includes X_val), so these validation
# metrics are optimistic.
y_val_pred_proba = final_model.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_pred_proba > best_threshold_main).astype(int)

final_auc = roc_auc_score(y_val, y_val_pred_proba)
final_precision = precision_score(y_val, y_val_pred, zero_division=0)
final_recall = recall_score(y_val, y_val_pred, zero_division=0)

# Scoring rule: 0.5*AUC + 0.3*Recall + 0.2*Precision
final_score = 0.5 * final_auc + 0.3 * final_recall + 0.2 * final_precision

print(f"\nFinal score breakdown:")
print(f"AUC: {final_auc:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"Final score: {final_score * 100:.2f}")

# Classification report
print(f"\nClassification report:")
print(classification_report(y_val, y_val_pred))

# Analysis summary
print(f"\n=== Analysis summary ===")
print(f"• Original feature count: {len(df_train.columns) - 2}")
print(f"• Features used: {len(features_to_use)}")
print(f"• Suspected fake indicators: {len(potential_fake_features)}")
print(f"• Train samples: {len(df_train)}")
print(f"• Test samples: {len(df_test)}")
print(f"• Best model: {type(final_model).__name__}")
print(f"• Best threshold: {best_threshold_main:.3f}")
print(f"• Validation AUC: {final_auc:.4f}")
print(f"• Cross-validation AUC: {cv_scores.mean():.4f}")
print(f"• Estimated final score: {final_score * 100:.2f}")

print("\nOptimized analysis complete!")
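# Optional follow-up (a sketch, not part of the original run): StratifiedKFold
# is imported above but unused. Cross-validating the full custom score, rather
# than AUC alone, gives a less optimistic estimate than the single validation
# split used above.
_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
_fold_scores = []
for _tr_idx, _va_idx in _skf.split(X_scaled, y):
    _m = copy.deepcopy(final_model)               # fresh copy, refit per fold
    _m.fit(X_scaled[_tr_idx], y.iloc[_tr_idx])
    _p = _m.predict_proba(X_scaled[_va_idx])[:, 1]
    _s, _ = custom_score_optimized(y.iloc[_va_idx], _p)
    _fold_scores.append(_s)
print(f"Stratified 5-fold custom score: {np.mean(_fold_scores):.4f} "
      f"(+/- {np.std(_fold_scores) * 2:.4f})")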