import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
import warnings
warnings.filterwarnings('ignore')
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
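# Optional (editor's sketch): 'SimHei' is not installed on every system. A hedged
# fallback is to list several CJK-capable fonts and let matplotlib use the first one
# it finds; this only affects plot labels, not the analysis itself.
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS', 'sans-serif']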
# 读取处理后的数据
df = pd.read_excel('分层去重后数据.xlsx')
# 一、增强的数据预处理
print("=" * 50)
print("第一步:增强的数据预处理")
print("=" * 50)
# 检查数据基本信息
print(f"数据形状: {df.shape}")
print(f"孕妇数量: {df['孕妇代码'].nunique()}")
print(f"各孕周阶段记录数量:")
print(df['孕周阶段'].value_counts())
# 1. 更严格的异常值检测与处理
def enhanced_outlier_detection(data, column, method='iqr', threshold=2.0):
"""
增强版异常值检测
method: 'iqr'或'zscore'
threshold: 对于iqr是倍数,对于zscore是标准差数
"""
if method == 'iqr':
Q1 = data[column].quantile(0.25)
Q3 = data[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - threshold * IQR
upper_bound = Q3 + threshold * IQR
return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
else: # zscore
z_scores = np.abs(stats.zscore(data[column]))
return data[z_scores < threshold]
# 对关键变量应用更严格的异常值处理
df_clean = df.copy()
for col in ['Y染色体浓度', '孕妇BMI', '检测孕周']:
df_clean = enhanced_outlier_detection(df_clean, col, method='iqr', threshold=1.8)
print(f"异常值处理后数据记录数: {len(df_clean)}")
# 2. 多种数据转换尝试
def try_transformations(data, column):
"""尝试多种数据转换方法"""
transformations = {
'原始': data[column],
'对数': np.log1p(data[column]),
'平方根': np.sqrt(data[column]),
        # boxcox 要求输入严格为正;加 1 后包装成 Series,保证后续 dropna()/偏度计算可用
        'Box-Cox': (pd.Series(stats.boxcox(data[column] + 1)[0], index=data.index)
                    if (data[column] + 1).min() > 0 else data[column])
}
# 计算每种转换的偏度和峰度
results = {}
for name, transformed in transformations.items():
skewness = stats.skew(transformed.dropna())
kurtosis = stats.kurtosis(transformed.dropna())
results[name] = {'偏度': skewness, '峰度': kurtosis}
return pd.DataFrame(results)
# 对Y染色体浓度尝试多种转换
transformation_results = try_transformations(df_clean, 'Y染色体浓度')
print("Y染色体浓度不同转换方法的偏度和峰度:")
print(transformation_results)
# 选择最佳转换(偏度最接近0的)
best_transform = transformation_results.T['偏度'].abs().idxmin()
print(f"选择的最佳转换方法: {best_transform}")
if best_transform == '对数':
df_clean['Y染色体浓度_transformed'] = np.log1p(df_clean['Y染色体浓度'])
elif best_transform == '平方根':
df_clean['Y染色体浓度_transformed'] = np.sqrt(df_clean['Y染色体浓度'])
elif best_transform == 'Box-Cox':
    # 保留 Box-Cox 的 lambda,便于之后把预测值逆变换回原始尺度
    df_clean['Y染色体浓度_transformed'], boxcox_lambda = stats.boxcox(df_clean['Y染色体浓度'] + 1)
else:
df_clean['Y染色体浓度_transformed'] = df_clean['Y染色体浓度']
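# Optional check (editor's sketch): a formal normality test on the chosen
# transformation, to complement the skewness/kurtosis comparison above. The p-value
# from scipy's D'Agostino normaltest is only a rough guide.
stat_norm, p_norm = stats.normaltest(df_clean['Y染色体浓度_transformed'].dropna())
print(f"normaltest: 统计量 = {stat_norm:.4f}, p值 = {p_norm:.4f}")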
# 可视化转换效果
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
sns.histplot(df_clean['Y染色体浓度'], kde=True)
plt.title('Y染色体浓度原始分布')
plt.subplot(1, 2, 2)
sns.histplot(df_clean['Y染色体浓度_transformed'], kde=True)
plt.title(f'Y染色体浓度{best_transform}转换后分布')
plt.tight_layout()
plt.show()
# 对孕周进行中心化处理
mean_gestational_week = df_clean['检测孕周'].mean()
df_clean['检测孕周_centered'] = df_clean['检测孕周'] - mean_gestational_week
print(f"孕周均值: {mean_gestational_week:.2f}")
# 二、增强的特征工程
print("=" * 50)
print("第二步:增强的特征工程")
print("=" * 50)
# 创建更多有意义的特征
def create_advanced_features(data):
"""创建高级特征"""
df_enhanced = data.copy()
# BMI分类
df_enhanced['BMI类别'] = pd.cut(df_enhanced['孕妇BMI'],
bins=[0, 18.5, 25, 30, 100],
labels=['偏瘦', '正常', '超重', '肥胖'])
# 孕周阶段细化
df_enhanced['孕周阶段细化'] = pd.cut(df_enhanced['检测孕周'],
bins=[0, 12, 16, 20, 24, 30],
labels=['早孕早期', '早孕晚期', '中孕早期', '中孕晚期', '晚孕期'])
# 年龄分组
df_enhanced['年龄分组'] = pd.cut(df_enhanced['年龄'],
bins=[0, 25, 30, 35, 40, 100],
labels=['25以下', '25-30', '30-35', '35-40', '40以上'])
# 创建多项式特征
df_enhanced['孕周平方'] = df_enhanced['检测孕周'] ** 2
df_enhanced['BMI立方'] = df_enhanced['孕妇BMI'] ** 3
# 创建更多交互项
df_enhanced['年龄_BMI交互'] = df_enhanced['年龄'] * df_enhanced['孕妇BMI']
df_enhanced['年龄_孕周交互'] = df_enhanced['年龄'] * df_enhanced['检测孕周']
df_enhanced['BMI_孕周交互'] = df_enhanced['孕妇BMI'] * df_enhanced['检测孕周_centered']
return df_enhanced
df_enhanced = create_advanced_features(df_clean)
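# Optional visualization (editor's sketch): correlation heatmap of the engineered
# numeric features. Polynomial and interaction terms are expected to correlate
# strongly with their base variables, which is worth inspecting before feature selection.
numeric_for_corr = df_enhanced.select_dtypes(include=[np.number])
plt.figure(figsize=(12, 10))
sns.heatmap(numeric_for_corr.corr(), cmap='coolwarm', center=0, annot=False)
plt.title('数值特征相关系数热力图')
plt.tight_layout()
plt.show()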
# 三、特征选择优化
print("=" * 50)
print("第三步:特征选择优化")
print("=" * 50)
# 准备特征数据(先把目标列与数值特征放进同一张表再统一去缺失,保证 X 与 y 行对齐)
numeric_cols = [c for c in df_enhanced.select_dtypes(include=[np.number]).columns
                if c not in ('Y染色体浓度', 'Y染色体浓度_transformed')]
model_data = df_enhanced[numeric_cols + ['Y染色体浓度_transformed']].dropna()
X_all = model_data[numeric_cols]
y = model_data['Y染色体浓度_transformed']
print(f"可用特征数量: {X_all.shape[1]}")
# 使用多种方法进行特征选择
def advanced_feature_selection(X, y, k=10):
"""使用多种方法进行特征选择"""
# 方差分析筛选
selector_anova = SelectKBest(score_func=f_regression, k='all')
selector_anova.fit(X, y)
anova_scores = pd.DataFrame({
'特征': X.columns,
'ANOVA得分': selector_anova.scores_
}).sort_values('ANOVA得分', ascending=False)
# 互信息筛选
selector_mi = SelectKBest(score_func=mutual_info_regression, k='all')
selector_mi.fit(X, y)
mi_scores = pd.DataFrame({
'特征': X.columns,
'互信息得分': selector_mi.scores_
}).sort_values('互信息得分', ascending=False)
# 随机森林重要性
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)
rf_importance = pd.DataFrame({
'特征': X.columns,
'随机森林重要性': rf.feature_importances_
}).sort_values('随机森林重要性', ascending=False)
return anova_scores, mi_scores, rf_importance
# 执行特征选择
anova_scores, mi_scores, rf_importance = advanced_feature_selection(X_all, y)
print("ANOVA特征得分前10:")
print(anova_scores.head(10))
print("\n互信息特征得分前10:")
print(mi_scores.head(10))
print("\n随机森林特征重要性前10:")
print(rf_importance.head(10))
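# Alternative (editor's sketch): instead of taking the union of each method's top 5
# (as done below), one could average the three rank positions and keep the overall
# best-ranked features. Shown only as another aggregation strategy.
rank_df = pd.DataFrame({
    'ANOVA排名': anova_scores.set_index('特征')['ANOVA得分'].rank(ascending=False),
    '互信息排名': mi_scores.set_index('特征')['互信息得分'].rank(ascending=False),
    '随机森林排名': rf_importance.set_index('特征')['随机森林重要性'].rank(ascending=False),
})
rank_df['平均排名'] = rank_df.mean(axis=1)
print("\n按三种方法平均排名的前10特征:")
print(rank_df.sort_values('平均排名').head(10))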
# 选择最佳特征(取三种方法的前5名)
top_features_anova = set(anova_scores.head(5)['特征'])
top_features_mi = set(mi_scores.head(5)['特征'])
top_features_rf = set(rf_importance.head(5)['特征'])
# 取三种方法的并集
selected_features = list(top_features_anova.union(top_features_mi).union(top_features_rf))
print(f"\n最终选择的特征: {selected_features}")
X_selected = X_all[selected_features]
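# Optional diagnostic (editor's sketch): variance inflation factors for the selected
# features. With polynomial and interaction terms in play, high VIFs (say > 10) flag
# multicollinearity, which mainly matters if a linear model ends up being chosen.
from statsmodels.stats.outliers_influence import variance_inflation_factor
X_vif = sm.add_constant(X_selected)
vif_df = pd.DataFrame({
    '特征': X_vif.columns,
    'VIF': [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])]
})
print("\n所选特征的方差膨胀因子(VIF):")
print(vif_df[vif_df['特征'] != 'const'])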
# 四、模型优化与选择
print("=" * 50)
print("第四步:模型优化与选择")
print("=" * 50)
# 尝试多种回归模型
def compare_models(X, y, cv=5):
"""比较多种回归模型"""
models = {
'线性回归': sm.OLS(y, sm.add_constant(X)).fit(),
'岭回归': Ridge(alpha=1.0),
'Lasso回归': Lasso(alpha=0.1),
'梯度提升': GradientBoostingRegressor(n_estimators=100, random_state=42),
'随机森林': RandomForestRegressor(n_estimators=100, random_state=42)
}
results = {}
for name, model in models.items():
if name == '线性回归':
# 对于statsmodels模型,直接使用R²
results[name] = {
'R²': model.rsquared,
'调整R²': model.rsquared_adj,
'AIC': model.aic,
'BIC': model.bic
}
else:
# 对于sklearn模型,使用交叉验证
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
results[name] = {
'R²': cv_scores.mean(),
'R²标准差': cv_scores.std(),
'AIC': None,
'BIC': None
}
return pd.DataFrame(results).T
# 比较不同模型
model_comparison = compare_models(X_selected, y)
print("不同模型性能比较:")
print(model_comparison.sort_values('R²', ascending=False))
# 选择最佳模型
best_model_name = model_comparison['R²'].idxmax()
print(f"\n选择的最佳模型: {best_model_name}")
# 优化最佳模型的超参数
def optimize_hyperparameters(X, y, model_type='gradient_boosting'):
"""优化模型的超参数"""
if model_type == 'gradient_boosting':
param_grid = {
'n_estimators': [50, 100, 200],
'learning_rate': [0.01, 0.05, 0.1],
'max_depth': [3, 4, 5],
'min_samples_split': [2, 5, 10]
}
model = GradientBoostingRegressor(random_state=42)
elif model_type == 'random_forest':
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [None, 5, 10],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
model = RandomForestRegressor(random_state=42)
else: # 线性模型不需要超参数优化
return None
grid_search = GridSearchCV(
model, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=0
)
grid_search.fit(X, y)
print(f"最佳参数: {grid_search.best_params_}")
print(f"最佳分数: {grid_search.best_score_:.4f}")
return grid_search.best_estimator_
# 根据选择的模型类型进行优化
if best_model_name in ['梯度提升', '随机森林']:
    model_type = 'gradient_boosting' if best_model_name == '梯度提升' else 'random_forest'
    best_model = optimize_hyperparameters(X_selected, y, model_type)
elif best_model_name == '岭回归':
    best_model = Ridge(alpha=1.0)
elif best_model_name == 'Lasso回归':
    best_model = Lasso(alpha=0.1)
else:
    # 对于普通线性回归,直接使用statsmodels的结果
    best_model = sm.OLS(y, sm.add_constant(X_selected)).fit()
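# Optional (editor's sketch): if 岭回归 or Lasso回归 wins the comparison, its
# regularisation strength can be tuned with the same GridSearchCV pattern used above
# instead of keeping the fixed alpha.
if best_model_name in ['岭回归', 'Lasso回归']:
    base_linear = Ridge() if best_model_name == '岭回归' else Lasso()
    alpha_search = GridSearchCV(base_linear, {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0]},
                                cv=5, scoring='r2', n_jobs=-1)
    alpha_search.fit(X_selected, y)
    print(f"正则化参数调优: 最佳alpha = {alpha_search.best_params_['alpha']}, "
          f"交叉验证R² = {alpha_search.best_score_:.4f}")
    best_model = alpha_search.best_estimator_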
# 五、模型评估与验证
print("=" * 50)
print("第五步:模型评估与验证")
print("=" * 50)
# 按孕妇分组的交叉验证(确保同一孕妇的记录不会同时出现在训练集和测试集)
patient_ids = df_enhanced['孕妇代码'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(patient_ids)):
train_patients = patient_ids[train_idx]
test_patients = patient_ids[test_idx]
    train_data = df_enhanced[df_enhanced['孕妇代码'].isin(train_patients)].dropna(
        subset=selected_features + ['Y染色体浓度_transformed'])
    test_data = df_enhanced[df_enhanced['孕妇代码'].isin(test_patients)].dropna(
        subset=selected_features + ['Y染色体浓度_transformed'])
    X_train = train_data[selected_features]
    X_test = test_data[selected_features]
    y_train = train_data['Y染色体浓度_transformed']
    y_test = test_data['Y染色体浓度_transformed']
# 训练模型
if hasattr(best_model, 'fit'): # sklearn模型
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)
else: # statsmodels模型
        model = sm.OLS(y_train, sm.add_constant(X_train, has_constant='add')).fit()
        y_pred = model.predict(sm.add_constant(X_test, has_constant='add'))
# 计算指标
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
cv_results.append({'fold': fold + 1, 'R²': r2, 'RMSE': rmse})
print(f"Fold {fold + 1}: R² = {r2:.4f}, RMSE = {rmse:.4f}")
cv_df = pd.DataFrame(cv_results)
print(f"\n交叉验证平均结果: R² = {cv_df['R²'].mean():.4f} (±{cv_df['R²'].std():.4f}), "
f"RMSE = {cv_df['RMSE'].mean():.4f} (±{cv_df['RMSE'].std():.4f})")
# 六、最终模型训练与结果分析
print("=" * 50)
print("第六步:最终模型训练与结果分析")
print("=" * 50)
# 使用全部数据训练最终模型
if hasattr(best_model, 'fit'): # sklearn模型
best_model.fit(X_selected, y)
y_pred = best_model.predict(X_selected)
final_r2 = r2_score(y, y_pred)
# 获取特征重要性(如果可用)
if hasattr(best_model, 'feature_importances_'):
feature_importance = pd.DataFrame({
'特征': X_selected.columns,
'重要性': best_model.feature_importances_
}).sort_values('重要性', ascending=False)
print("\n特征重要性排序:")
print(feature_importance)
else: # statsmodels模型
final_model = best_model
y_pred = final_model.predict(sm.add_constant(X_selected))
final_r2 = final_model.rsquared
print("\n最终模型摘要:")
print(final_model.summary())
# 模型显著性检验
print(f"F统计量: {final_model.fvalue:.4f}, p值: {final_model.f_pvalue:.6f}")
if final_model.f_pvalue < 0.05:
print("✅ 整体模型显著 (p < 0.05)")
else:
print("❌ 整体模型不显著 (p ≥ 0.05)")
print(f"\n最终模型R²: {final_r2:.4f}")
# 可视化最终结果
plt.figure(figsize=(15, 10))
# 1. 模型预测值与实际值对比
plt.subplot(2, 3, 1)
plt.scatter(y, y_pred)
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'r--')
plt.xlabel('实际值')
plt.ylabel('预测值')
plt.title('预测值与实际值对比')
# 2. 残差图
plt.subplot(2, 3, 2)
residuals = y - y_pred
plt.scatter(y_pred, residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('预测值')
plt.ylabel('残差')
plt.title('残差图')
# 3. 残差分布
plt.subplot(2, 3, 3)
sns.histplot(residuals, kde=True)
plt.xlabel('残差')
plt.title('残差分布')
# 4. 特征重要性(如果可用)
if 'feature_importance' in locals():
plt.subplot(2, 3, 4)
plt.barh(range(len(feature_importance)), feature_importance['重要性'])
plt.yticks(range(len(feature_importance)), feature_importance['特征'])
plt.xlabel('特征重要性')
plt.title('特征重要性排序')
# 5. 交叉验证结果
plt.subplot(2, 3, 5)
plt.bar(range(len(cv_df)), cv_df['R²'])
plt.axhline(y=cv_df['R²'].mean(), color='r', linestyle='--', label='平均R²')
plt.xlabel('交叉验证折数')
plt.ylabel('R²')
plt.title('交叉验证结果')
plt.legend()
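# 6. Optional (editor's sketch): a Q-Q plot of the residuals in the unused sixth panel,
# as a quick normality check to complement the residual histogram.
plt.subplot(2, 3, 6)
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('残差Q-Q图')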
plt.tight_layout()
plt.show()
# 保存最终结果
results_df = pd.DataFrame({
'实际值': y,
'预测值': y_pred,
'残差': residuals
})
results_df.to_excel('模型预测结果.xlsx', index=False)
print("预测结果已保存到 '模型预测结果.xlsx'")
# 模型解释
print("\n模型解释:")
if hasattr(best_model, 'feature_importances_') and 'feature_importance' in locals():
    top_feature = feature_importance.iloc[0]
    print(f"- 最重要的特征是 '{top_feature['特征']}',重要性为 {top_feature['重要性']:.4f}")
elif 'final_model' in locals() and hasattr(final_model, 'params'):
    # 对于statsmodels线性模型,解释系数
    for feature, coef in final_model.params.items():
        if feature != 'const':
            direction = "正" if coef > 0 else "负"
            print(f"- 特征 '{feature}' 对Y染色体浓度有{direction}向影响 (系数: {coef:.4f})")
elif hasattr(best_model, 'coef_'):
    # 对于sklearn线性模型(岭回归/Lasso),解释系数
    for feature, coef in zip(X_selected.columns, best_model.coef_):
        direction = "正" if coef > 0 else "负"
        print(f"- 特征 '{feature}' 对Y染色体浓度有{direction}向影响 (系数: {coef:.4f})")
print(f"\n模型性能总结:")
print(f"- 最终R²: {final_r2:.4f}")
print(f"- 交叉验证平均R²: {cv_df['R²'].mean():.4f} (±{cv_df['R²'].std():.4f})")
print(f"- 交叉验证平均RMSE: {cv_df['RMSE'].mean():.4f} (±{cv_df['RMSE'].std():.4f})") 这是我有错误的代码