import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import warnings
import copy
from scipy.stats import pearsonr
warnings.filterwarnings('ignore')
# Configure a CJK-capable font so Chinese category values in the data
# (province and industry names) render correctly in plots
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
print("=== Optimized Enterprise Default Prediction Analysis ===")
# 1. Data loading and initial exploration
print("\n1. Loading data...")
df_train = pd.read_csv('./train.csv', low_memory=False)
df_test = pd.read_csv('./test.csv', low_memory=False)
data_dict = pd.read_csv('./数据字典.csv', encoding='utf-8')  # data dictionary
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
# 2. Data quality checks
print("\n2. Data quality checks...")
# Missing values
missing_train = df_train.isnull().sum()
missing_test = df_test.isnull().sum()
print("Missing values in the training set:")
print(missing_train[missing_train > 0].sort_values(ascending=False).head(10))
print("\nMissing values in the test set:")
print(missing_test[missing_test > 0].sort_values(ascending=False).head(10))
# Target distribution
target_dist = df_train['target'].value_counts()
print(f"\nTarget distribution:\n{target_dist}")
print(f"Default rate: {df_train['target'].mean():.4f}")
# 3. Data cleaning
print("\n3. Data cleaning...")
# Categorical variables
categorical_columns = ['province', 'city', 'industry_l1_name', 'industry_l2_name',
                       'industry_l3_name', 'industry_l4_name', 'honor_titles',
                       'sci_tech_ent_tags', 'top100_tags', 'company_on_scale']
# Label-encode the categorical variables
label_encoders = {}
for col in categorical_columns:
    if col in df_train.columns:
        le = LabelEncoder()
        # Fit on train and test combined so both sets share one consistent mapping
        combined = pd.concat([df_train[col].fillna('missing'),
                              df_test[col].fillna('missing')])
        le.fit(combined)
        df_train[col] = le.transform(df_train[col].fillna('missing'))
        df_test[col] = le.transform(df_test[col].fillna('missing'))
        label_encoders[col] = le
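# Note (added): fitting each encoder on the concatenated train+test values
# guarantees transform() never raises on an unseen label, at the cost of
# letting test-set categories influence the mapping. A hedged alternative,
# if that is a concern, is to fit on the training set only and bucket unseen
# test values into the existing 'missing' class, e.g.:
#   known = set(df_train[col].fillna('missing'))
#   test_vals = df_test[col].fillna('missing')
#   df_test[col] = le.transform(test_vals.where(test_vals.isin(known), 'missing'))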
# Impute missing values in numeric variables
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col not in ['company_id', 'target']]
for col in numeric_columns:
    # Fill with the training-set median (the same value is applied to the test set)
    median_val = df_train[col].median()
    df_train[col].fillna(median_val, inplace=True)
    if col in df_test.columns:
        df_test[col].fillna(median_val, inplace=True)
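# Sanity check (added): after median imputation no NaNs should remain in the
# numeric training columns; print the residual count to make that visible.
remaining_na = int(df_train[numeric_columns].isnull().sum().sum())
print(f"Remaining NaNs in numeric training columns: {remaining_na}")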
# 4. Feature engineering
print("\n4. Feature engineering...")
def safe_divide(a, b):
    """Safe element-wise division: avoids division-by-zero and infinite values."""
    with np.errstate(divide='ignore', invalid='ignore'):
        result = np.divide(a, b)
    result[~np.isfinite(result)] = 0  # replace inf/-inf/NaN with 0
    return result
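# Illustrative example (added): zero denominators come back as 0 instead of inf:
#   safe_divide(np.array([1.0, 2.0]), np.array([0.0, 4.0]))  ->  array([0. , 0.5])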
# Financial health ratios (the +1 in each denominator guards against all-zero
# balance sheets); the same guarded logic is applied to train and test so the
# two sets always end up with identical engineered columns
for frame in (df_train, df_test):
    if all(col in frame.columns for col in ['total_assets', 'total_liabilities', 'total_equity']):
        frame['asset_liability_ratio'] = safe_divide(frame['total_liabilities'], frame['total_assets'] + 1)
        frame['equity_ratio'] = safe_divide(frame['total_equity'], frame['total_assets'] + 1)
        frame['debt_to_equity'] = safe_divide(frame['total_liabilities'], frame['total_equity'] + 1)
# Growth/profitability indicator
for frame in (df_train, df_test):
    if all(col in frame.columns for col in ['net_profit', 'total_sales']):
        frame['profit_margin'] = safe_divide(frame['net_profit'], frame['total_sales'] + 1)
# Change-frequency indicator: total business changes over the last 12 months
for frame in (df_train, df_test):
    change_features_12m = [col for col in frame.columns if 'chg_cnt' in col and '12m' in col]
    frame['total_change_12m'] = frame[change_features_12m].sum(axis=1) if change_features_12m else 0
# Risk indicator: abnormal-operation, punishment and dishonesty counts over 12 months
for frame in (df_train, df_test):
    risk_features_12m = [col for col in frame.columns
                         if any(x in col for x in ['abn_cnt', 'pun_cnt', 'dishonest']) and '12m' in col]
    frame['total_risk_12m'] = frame[risk_features_12m].sum(axis=1) if risk_features_12m else 0
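# Quick check (added): Pearson correlation of each engineered feature with the
# target, as a rough signal-strength indicator. The membership test guards
# against ratios that were skipped because their inputs were absent.
for eng_col in ['asset_liability_ratio', 'equity_ratio', 'debt_to_equity',
                'profit_margin', 'total_change_12m', 'total_risk_12m']:
    if eng_col in df_train.columns:
        print(f"corr({eng_col}, target) = {df_train[eng_col].corr(df_train['target']):.4f}")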
# 5. Exploratory data analysis
print("\n5. Exploratory data analysis...")
plt.figure(figsize=(15, 10))
plt.subplot(2, 3, 1)
df_train['target'].value_counts().plot.pie(autopct='%1.1f%%', colors=['lightblue', 'lightcoral'])
plt.title('Target distribution')
# Key financial metrics
key_financial_metrics = ['total_assets', 'total_liabilities', 'total_equity',
                         'total_sales', 'total_profit', 'net_profit']
plt.subplot(2, 3, 2)
for metric in key_financial_metrics[:3]:
    if metric in df_train.columns:
        # Signed log transform to tame extreme values
        safe_data = np.log1p(np.abs(df_train[metric])) * np.sign(df_train[metric])
        sns.kdeplot(safe_data, label=metric)
plt.title('Key financial metrics (signed log scale)')
plt.legend()
plt.subplot(2, 3, 3)
# Financial-ratio analysis
if 'debt_to_equity' in df_train.columns:
    sns.boxplot(x='target', y='debt_to_equity', data=df_train)
    plt.title('Debt-to-equity vs default')
# Time-window features
time_features = [col for col in df_train.columns if any(x in col for x in ['12m', '36m', '60m'])]
plt.subplot(2, 3, 4)
if len(time_features) > 0:
    # Show the first time-window feature as an example
    feature = time_features[0]
    df_train.groupby('target')[feature].mean().plot(kind='bar')
    plt.title(f'{feature} vs default')
# Regional distribution (province is label-encoded at this point, so the
# x-axis shows encoded IDs rather than names)
plt.subplot(2, 3, 5)
if 'province' in df_train.columns:
    province_default = df_train.groupby('province')['target'].mean().sort_values(ascending=False).head(10)
    province_default.plot(kind='bar')
    plt.title('Top 10 provinces by default rate')
# Industry distribution
plt.subplot(2, 3, 6)
if 'industry_l1_name' in df_train.columns:
    industry_default = df_train.groupby('industry_l1_name')['target'].mean().sort_values(ascending=False).head(10)
    industry_default.plot(kind='bar')
    plt.title('Top 10 industries by default rate')
plt.tight_layout()
plt.show()
# 6. Improved fake-feature detection, further optimized
print("\n6. Improved fake-feature detection...")
def detect_fake_features_optimized(df, target_col='target', min_fake=2, max_fake=5):
    """
    Detect likely 'fake' (uninformative) features via a weighted combination of signals.
    """
    # Method 1: random-forest feature importance
    X_temp = df.drop(['company_id', target_col], axis=1, errors='ignore')
    y_temp = df[target_col]
    numeric_features = X_temp.select_dtypes(include=[np.number]).columns
    X_temp = X_temp[numeric_features]
    X_temp = X_temp.replace([np.inf, -np.inf], np.nan).fillna(X_temp.median())
    rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
    rf.fit(X_temp, y_temp)
    # reset_index so the positional rank (not the original column position)
    # drives the score below
    feature_importance = pd.DataFrame({
        'feature': numeric_features,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    # Method 2: XGBoost feature importance
    xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, max_depth=6)
    xgb_model.fit(X_temp, y_temp)
    xgb_importance = pd.DataFrame({
        'feature': numeric_features,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    # Method 3: correlation with the target
    feature_corr = []
    for col in numeric_features:
        try:
            clean_data = df[col].replace([np.inf, -np.inf], np.nan).fillna(0)
            if clean_data.nunique() > 1:
                corr, p_value = pearsonr(clean_data, df[target_col])
                if not np.isnan(corr):
                    feature_corr.append((col, abs(corr), corr, p_value))
        except Exception:
            continue
    feature_corr.sort(key=lambda x: x[1], reverse=True)
    # Method 4: near-zero variance
    low_variance_features = [col for col in numeric_features if df[col].std() < 0.001]
    # Combine the signals with a weighted score
    feature_scores = {}
    for feature in numeric_features:
        score = 0
        # Random-forest importance rank (weight 0.4)
        rf_rank = feature_importance[feature_importance['feature'] == feature].index
        if len(rf_rank) > 0:
            score += (1 - rf_rank[0] / len(feature_importance)) * 0.4
        # XGBoost importance rank (weight 0.3)
        xgb_rank = xgb_importance[xgb_importance['feature'] == feature].index
        if len(xgb_rank) > 0:
            score += (1 - xgb_rank[0] / len(xgb_importance)) * 0.3
        # Absolute correlation with the target (weight 0.2)
        corr_info = next((item for item in feature_corr if item[0] == feature), None)
        if corr_info:
            score += corr_info[1] * 0.2
        # Near-zero-variance penalty (weight 0.1)
        if feature in low_variance_features:
            score -= 0.1
        feature_scores[feature] = score
    # Sort ascending: the lowest-scoring features are the fake-feature candidates
    sorted_features = sorted(feature_scores.items(), key=lambda x: x[1])
    # Exclude features that are clearly important on business grounds
    important_keywords = ['lhc_cnt', 'court', 'justice', 'dishonest', 'pun_cnt', 'abn_cnt',
                          'total_assets', 'total_liabilities', 'total_equity', 'net_profit',
                          'legal_person', 'bus_chg', 'establish_year']
    fake_candidates = []
    for feature, score in sorted_features:
        if len(fake_candidates) >= max_fake:
            break
        # Flag as fake only if the score is very low and no protected keyword matches
        if not any(keyword in feature for keyword in important_keywords) and score < 0.1:
            fake_candidates.append(feature)
    # Top up to the minimum count from the lowest-scoring remainder if needed
    if len(fake_candidates) < min_fake:
        for feature, score in sorted_features:
            if feature not in fake_candidates and len(fake_candidates) < min_fake:
                fake_candidates.append(feature)
    return fake_candidates[:max_fake]
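# Design note (added): each importance signal is converted to a rank-based
# score in [0, 1) (rank divided by feature count) before the 0.4/0.3/0.2
# weights are applied, so random-forest, XGBoost and correlation evidence are
# on a comparable scale; near-zero-variance features take a flat 0.1 penalty.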
# Run the optimized fake-feature detection
potential_fake_features = detect_fake_features_optimized(df_train)
print(f"Suspected fake features ({len(potential_fake_features)}): {potential_fake_features}")
# 7. Improved model building with further parameter tuning
print("\n7. Improved model building...")
# Assemble the feature matrix from numeric columns both sets share
common_features = list(set(df_train.columns) & set(df_test.columns))
common_features = [col for col in common_features if col not in ['company_id', 'target']]
common_features = [col for col in common_features if pd.api.types.is_numeric_dtype(df_train[col])]
# Drop the suspected fake features
features_to_use = [col for col in common_features if col not in potential_fake_features]
print(f"Number of features used: {len(features_to_use)}")
X = df_train[features_to_use]
y = df_train['target']
X_test = df_test[features_to_use]
# Replace infinities, then impute both sets with the training-set medians
# (computed before filling, so train and test use the same values)
X = X.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
train_medians = X.median()
X = X.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Feature scaling (note: the scaler is fit on the full training set, so the
# train/validation split below shares its statistics)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
# Train/validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2,
                                                  random_state=42, stratify=y)
# Optimized scoring function with strict default-rate control
def custom_score_optimized(y_true, y_pred_proba_val):
    """Competition metric: 0.5*AUC + 0.3*Recall + 0.2*Precision, with the
    threshold constrained so the predicted default rate stays plausible."""
    auc_score = roc_auc_score(y_true, y_pred_proba_val)
    best_score_val = 0
    best_threshold_val = 0.5
    actual_default_rate_val = y_true.mean()
    # Tighter bounds on the acceptable predicted default rate
    min_default_rate = max(0.03, actual_default_rate_val * 0.7)  # raised floor
    max_default_rate = min(0.10, actual_default_rate_val * 1.3)  # lowered ceiling
    # Fine-grained threshold search
    for threshold_val in np.arange(0.01, 0.99, 0.002):
        y_pred_val = (y_pred_proba_val > threshold_val).astype(int)
        pred_default_rate_val = y_pred_val.mean()
        # Enforce the default-rate constraint
        if pred_default_rate_val < min_default_rate or pred_default_rate_val > max_default_rate:
            continue
        precision_val = precision_score(y_true, y_pred_val, zero_division=0)
        recall_val = recall_score(y_true, y_pred_val, zero_division=0)
        # Exactly the competition formula
        score_val = 0.5 * auc_score + 0.3 * recall_val + 0.2 * precision_val
        if score_val > best_score_val:
            best_score_val = score_val
            best_threshold_val = threshold_val
    # Fallback: if no threshold satisfied the rate bounds, optimize F1 over an
    # unconstrained search. (Keeping the same bounds here, as the original did,
    # would make this branch dead code: it could only run when no threshold
    # meets the bounds, so it would fail for exactly the same reason.)
    if best_score_val == 0:
        print("  Warning: no threshold met the default-rate bounds; falling back to F1 search")
        for threshold_val in np.arange(0.01, 0.99, 0.01):
            y_pred_val = (y_pred_proba_val > threshold_val).astype(int)
            score_val = f1_score(y_true, y_pred_val, zero_division=0)
            if score_val > best_score_val:
                best_score_val = score_val
                best_threshold_val = threshold_val
    return best_score_val, best_threshold_val
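# Usage sketch (added): the function returns the best composite score and the
# threshold that achieved it, e.g.
#   score, thr = custom_score_optimized(y_val, model.predict_proba(X_val)[:, 1])
# Since AUC is threshold-independent, the search effectively tunes the
# 0.3*Recall + 0.2*Precision part subject to the default-rate bounds.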
# Further-tuned model parameters
models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, class_weight='balanced',
        C=0.8, max_iter=2000, solver='liblinear'  # lower C for stronger regularization
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=250, random_state=42, max_depth=12,  # more trees, shallower depth
        class_weight='balanced', min_samples_split=25,    # stricter split requirement
        min_samples_leaf=15, max_features=0.6,            # more conservative settings
        bootstrap=True, n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=250, random_state=42, max_depth=5,   # more trees, shallower depth
        learning_rate=0.08, subsample=0.75,               # lower learning rate and subsample
        min_samples_split=25, min_samples_leaf=15,        # stricter split requirement
        max_features=0.7
    ),
    'XGBoost': xgb.XGBClassifier(
        n_estimators=250, random_state=42, max_depth=5,   # more trees, shallower depth
        learning_rate=0.08, subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.15, reg_lambda=0.15, gamma=0.1,       # stronger regularization
        scale_pos_weight=(y_train == 0).sum() / (y_train == 1).sum(),
        eval_metric='logloss', n_jobs=-1
    ),
    'LightGBM': lgb.LGBMClassifier(
        n_estimators=300, random_state=42, max_depth=7,
        learning_rate=0.06, subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.2, reg_lambda=0.2, min_child_samples=60,  # stronger regularization, larger leaves
        num_leaves=35, boosting_type='gbdt', n_jobs=-1,
        class_weight='balanced', min_split_gain=0.02,
        subsample_freq=1, verbose=-1
    )
}
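# Context (added): print the class imbalance that motivates class_weight and
# scale_pos_weight in the models above.
n_pos = int((y_train == 1).sum())
n_neg = int((y_train == 0).sum())
print(f"Training split: {n_neg} non-default vs {n_pos} default "
      f"(scale_pos_weight = {n_neg / max(n_pos, 1):.2f})")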
print("\n模型性能比较:")
best_model = None
best_score_main = 0
best_threshold_main = 0.5
model_performance = {}
for name, model in models.items():
try:
print(f"训练 {name}...")
model.fit(X_train, y_train)
y_pred_proba_main = model.predict_proba(X_val)[:, 1]
score_main, threshold_main = custom_score_optimized(y_val, y_pred_proba_main)
auc_main = roc_auc_score(y_val, y_pred_proba_main)
y_pred_main = (y_pred_proba_main > threshold_main).astype(int)
pred_default_rate_main = y_pred_main.mean()
precision_main = precision_score(y_val, y_pred_main, zero_division=0)
recall_main = recall_score(y_val, y_pred_main, zero_division=0)
f1_main = f1_score(y_val, y_pred_main, zero_division=0)
print(f"{name:20} AUC: {auc_main:.4f}, 自定义评分: {score_main:.4f}, 阈值: {threshold_main:.3f}")
print(
f"{'':20} Precision: {precision_main:.4f}, Recall: {recall_main:.4f}, F1: {f1_main:.4f}, 预测违约率: {pred_default_rate_main:.4f}")
model_performance[name] = {
'model': model,
'score': score_main,
'auc': auc_main,
'threshold': threshold_main,
'precision': precision_main,
'recall': recall_main,
'f1': f1_main
}
if score_main > best_score_main:
best_score_main = score_main
best_model = model
best_threshold_main = threshold_main
except Exception as e:
print(f"{name} 训练失败: {str(e)}")
print(
f"\n最佳模型: {type(best_model).__name__}, 自定义评分: {best_score_main:.4f}, 最佳阈值: {best_threshold_main:.3f}")
# 8. Improved model ensembling
print("\n8. Improved model ensembling...")
if len(model_performance) >= 2:
    # Ensemble the top-performing models
    top_models = sorted(model_performance.items(), key=lambda x: x[1]['score'], reverse=True)[:3]
    print(f"Ensemble candidates: {[name for name, _ in top_models]}")
    # Soft-voting ensemble
    estimators_list = [(name, perf['model']) for name, perf in top_models]
    voting_clf = VotingClassifier(estimators=estimators_list, voting='soft')
    voting_clf.fit(X_train, y_train)
    # Evaluate the ensemble
    y_pred_proba_ensemble = voting_clf.predict_proba(X_val)[:, 1]
    ensemble_score, ensemble_threshold = custom_score_optimized(y_val, y_pred_proba_ensemble)
    ensemble_auc = roc_auc_score(y_val, y_pred_proba_ensemble)
    print(f"Ensemble performance: AUC: {ensemble_auc:.4f}, custom score: {ensemble_score:.4f}")
    # Adopt the ensemble only if it is clearly better
    improvement_threshold = 0.003  # deliberately small margin
    if ensemble_score > best_score_main + improvement_threshold:
        best_model = voting_clf
        best_threshold_main = ensemble_threshold
        print(f"Using the ensemble as the final model, improvement: {ensemble_score - best_score_main:.4f}")
    else:
        print("Keeping the single best model as the final model")
else:
    print("Not enough models; skipping ensembling")
# 9. Model interpretation
print("\n9. Model interpretation...")
if hasattr(best_model, 'feature_importances_'):
    # Feature importances for tree-based models
    importance_df = pd.DataFrame({
        'feature': features_to_use,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance_df.head(15), y='feature', x='importance')
    plt.title('Top 15 important features')
    plt.tight_layout()
    plt.show()
    print("Top 10 important features:")
    print(importance_df.head(10))
# 10. Robust test-set prediction with strict default-rate control
print("\n10. Robust test-set prediction...")
# Refit the best model on the entire training set
final_model = copy.deepcopy(best_model)
final_model.fit(X_scaled, y)
# Predicted probabilities on the test set
test_predictions_proba = final_model.predict_proba(X_test_scaled)[:, 1]
# Hard predictions at the tuned threshold
test_predictions = (test_predictions_proba > best_threshold_main).astype(int)
# Verify the predicted default rate
pred_default_rate = test_predictions.mean()
actual_default_rate = y.mean()
print("\nPredicted default rate check:")
print(f"Actual training default rate: {actual_default_rate:.4f}")
print(f"Predicted test default rate: {pred_default_rate:.4f}")
# Tighter bounds on the acceptable predicted rate
target_min_rate = actual_default_rate * 0.8   # raised floor
target_max_rate = actual_default_rate * 1.25  # lowered ceiling
if pred_default_rate < target_min_rate or pred_default_rate > target_max_rate:
    print("Adjusting the predicted default rate...")
    # Conservative quantile-based adjustment toward the actual rate
    target_rate = min(max(actual_default_rate * 1.1, target_min_rate), target_max_rate)
    adjusted_threshold = np.percentile(test_predictions_proba, 100 * (1 - target_rate))
    # Blend the tuned threshold with the quantile threshold
    balanced_threshold = best_threshold_main * 0.6 + adjusted_threshold * 0.4
    test_predictions = (test_predictions_proba > balanced_threshold).astype(int)
    best_threshold_main = balanced_threshold
    pred_default_rate = test_predictions.mean()
    print(f"Blended threshold: {best_threshold_main:.3f}, adjusted default rate: {pred_default_rate:.4f}")
print(f"Target default-rate range: [{target_min_rate:.4f}, {target_max_rate:.4f}]")
# 11. Build the submission file
print("\n11. Building the submission file...")
submit_df = pd.DataFrame({
    'uuid': df_test['company_id'],
    'proba': test_predictions_proba,
    'prediction': test_predictions
})
# Save the results
submit_df.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")
# Prediction summary
print("\nPrediction summary:")
print(f"Companies predicted to default: {test_predictions.sum()}")
print(f"Predicted default rate: {test_predictions.mean():.4f}")
print(f"Actual training default rate: {df_train['target'].mean():.4f}")
print(f"Probability range: [{test_predictions_proba.min():.4f}, {test_predictions_proba.max():.4f}]")
print(f"Threshold used: {best_threshold_main:.3f}")
# 12. Final model evaluation
print("\n12. Final model evaluation...")
# Stratified 5-fold cross-validation (cross_val_score refits clones of the model)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(final_model, X_scaled, y, cv=cv, scoring='roc_auc')
print(f"5-fold CV AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
# Final score, strictly per the competition rule: 0.5*AUC + 0.3*Recall + 0.2*Precision.
# best_model (fit on X_train only) is evaluated here; final_model has already
# seen the validation rows, so scoring it on X_val would be optimistically biased.
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]
y_val_pred = (y_val_pred_proba > best_threshold_main).astype(int)
final_auc = roc_auc_score(y_val, y_val_pred_proba)
final_precision = precision_score(y_val, y_val_pred, zero_division=0)
final_recall = recall_score(y_val, y_val_pred, zero_division=0)
final_score = 0.5 * final_auc + 0.3 * final_recall + 0.2 * final_precision
print("\nFinal score breakdown:")
print(f"AUC: {final_auc:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"Final score: {final_score * 100:.2f}")
# Classification report
print("\nClassification report:")
print(classification_report(y_val, y_val_pred))
# Summary
print("\n=== Analysis summary ===")
print(f"• Columns available (incl. engineered, excl. id/target): {len(df_train.columns) - 2}")
print(f"• Features used: {len(features_to_use)}")
print(f"• Suspected fake features: {len(potential_fake_features)}")
print(f"• Training samples: {len(df_train)}")
print(f"• Test samples: {len(df_test)}")
print(f"• Best model: {type(final_model).__name__}")
print(f"• Best threshold: {best_threshold_main:.3f}")
print(f"• Validation AUC: {final_auc:.4f}")
print(f"• Cross-validated AUC: {cv_scores.mean():.4f}")
print(f"• Estimated final score: {final_score * 100:.2f}")
print("\nOptimization analysis complete!")