查看问题2的代码后输出完整的问题3代码
问题2的代码为
import pandas as pd
import numpy as np
from scipy.optimize import minimize_scalar
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
class NIPTOptimizer:
def __init__(self, data_path):
self._validate_data(data_path)
self.data = pd.read_excel(data_path)
self._ensure_column_consistency()
# BMI 分组定义
self.bmi_groups = [(20.0, 28.0), (28.0, 32.0), (32.0, 36.0), (36.0, 40.0)]
self.group_labels = ['正常低值', '正常高值', '中度偏高', '高度偏高']
def _validate_data(self, path):
"""验证数据路径是否合法"""
if not isinstance(path, str) or not path.endswith(('.xls', '.xlsx')):
raise ValueError("无效的数据文件路径")
def _ensure_column_consistency(self):
"""确保关键列存在"""
required_cols = [
'孕妇BMI', '检测孕周', '检测抽血次数',
'GC含量', 'Y染色体浓度',
'13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值'
]
for col in required_cols:
if col not in self.data.columns:
raise ValueError(f"缺少必要字段: {col}")
def get_weights(self, group_label):
"""根据不同 BMI 分组设置风险权重"""
return {
'正常低值': (0.7, 0.3),
'正常高值': (0.6, 0.4),
'中度偏高': (0.5, 0.5),
'高度偏高': (0.3, 0.7)
}[group_label]
def compute_data_quality(self, group_data):
"""
计算数据质量:
▪ 相对质量 = 抽血次数 / 组内均值
▪ 绝对质量 = 抽血次数
▪ 综合质量 = 0.3 * 相对质量 + 0.7 * 绝对质量
"""
avg_draws = group_data['检测抽血次数'].mean()
group_data['相对数据质量'] = group_data['检测抽血次数'] / avg_draws
group_data['绝对数据质量'] = group_data['检测抽血次数']
group_data['综合数据质量'] = 0.3 * group_data['相对数据质量'] + 0.7 * group_data['绝对数据质量']
return group_data
def _calculate_rt_risk(self, t_min, current_week, bmi):
"""
窗口期风险函数:
▪ 随孕周增长而上升
▪ 随实际 BMI 值增大而上升(乘性影响)
▪ 使用 bmi 来调节风险放大程度
"""
base_risk = 0.1 + 0.05 * max(0, current_week - t_min)
base_bmi = 20.0
bmi_factor = max(0, bmi - base_bmi) # 只对高于基准的 BMI 产生影响
risk_multiplier = 1 + 0.02 * bmi_factor # 0.02 为可调参数
# 抽血次数惩罚因子
blood_draw_count = self.data[self.data['检测孕周'] == current_week]['检测抽血次数'].mean()
blood_penalty = 1 + 0.1 * max(0, 3 - blood_draw_count)
# 最终风险
rt_risk = base_risk * risk_multiplier * blood_penalty
return min(rt_risk, 0.8)
def _calculate_re_risk(self, group_data, t):
"""检测误差风险计算"""
subset = self._get_time_window_data(group_data, t)
if len(subset) < 3:
return self._handle_insufficient_data(group_data)
# GC含量异常检测
gc_abnormal = self._check_gc_content(subset)
# CI 宽度计算
ci_widths = self._calculate_ci_widths(subset)
# Y染色体浓度变异系数
cv_y = self._calculate_cv_y(subset)
return 0.4 * gc_abnormal + 0.3 * np.mean(ci_widths) + 0.3 * cv_y
def _get_time_window_data(self, group_data, t):
"""获取时间窗口数据"""
return group_data[
(group_data['检测孕周'] >= t - 0.5) &
(group_data['检测孕周'] <= t + 0.5)
].copy()
def _check_gc_content(self, subset):
"""GC含量异常检测"""
return subset['GC含量'].apply(
lambda x: 1 if x < 39.5 or x > 60 else 0
).mean()
def _calculate_ci_widths(self, subset):
"""置信区间宽度计算"""
z_cols = ['13号染色体的Z值', '18号染色体的Z值', '21号染色体的Z值']
return [
1.96 * (subset[col].std() / np.sqrt(len(subset[col])))
for col in z_cols if col in subset.columns
]
def _calculate_cv_y(self, subset):
"""Y染色体浓度变异系数"""
y_conc = subset['Y染色体浓度']
return y_conc.std() / y_conc.mean() if y_conc.mean() > 0 else 0.3
def _handle_insufficient_data(self, group_data):
"""数据不足处理策略"""
blood_draw = group_data['检测抽血次数'].mean()
return 0.8 - 0.1 * min(blood_draw, 3)
def calculate_penalized_re_risk(self, group_data, t, epsilon=0.001):
"""加入数据质量惩罚的检测误差风险"""
re = self._calculate_re_risk(group_data, t)
dq = group_data['综合数据质量'].mean()
return re * (1 / (dq + epsilon))
def estimate_t_min(self, group_data):
"""Y浓度达标孕周估计"""
try:
group_data = group_data.sort_values('检测孕周')
if len(group_data) < 5:
return 12.0
lowess = sm.nonparametric.lowess(
group_data['Y染色体浓度'],
group_data['检测孕周'],
frac=0.3, it=3
)
smooth_df = pd.DataFrame(lowess, columns=['孕周', '浓度'])
valid_weeks = smooth_df[smooth_df['浓度'] >= 4.0]['孕周']
return valid_weeks.min() if not valid_weeks.empty else 12.0
except Exception as e:
print(f"LOWESS估计异常: {str(e)}")
return 12.0
def _objective_function(self, t, group_data, weights):
"""目标函数封装:逐样本计算 rt_risk 并取平均"""
t_min = group_data['检测孕周'].max()
# 逐样本计算 rt_risk
rt_risks = [
self._calculate_rt_risk(t_min, t, row['孕妇BMI'])
for _, row in group_data.iterrows()
]
rt = np.mean(rt_risks) # 取平均值作为组级窗口期风险
re = self._calculate_re_risk(group_data, t)
dq = group_data['综合数据质量'].mean()
epsilon = 0.01
re *= (1 / (dq + epsilon))
return weights[0] * re + weights[1] * rt
def optimize(self):
results, risk_analysis = [], []
for label, (bmi_min, bmi_max) in zip(self.group_labels, self.bmi_groups):
group_data = self.data[
(self.data['孕妇BMI'] >= bmi_min) &
(self.data['孕妇BMI'] < bmi_max)
].copy()
if len(group_data) < 2:
continue # 跳过样本过少的组
group_data = self.compute_data_quality(group_data)
t_min = self.estimate_t_min(group_data)
weights = self.get_weights(label)
result = minimize_scalar(
self._objective_function,
bounds=(max(t_min, 10.0), 25.0),
method='bounded',
args=(group_data, weights)
)
# 收集结果
rt_risks = [
self._calculate_rt_risk(t_min, result.x, row['孕妇BMI'])
for _, row in group_data.iterrows()
]
rt = np.mean(rt_risks)
results.append({
'分组标签': label,
'BMI区间': f"({bmi_min:.1f}, {bmi_max:.1f})",
'最佳检测周': round(result.x, 4),
'最小总风险': round(result.fun, 4),
'平均窗口期风险': round(rt, 4),
'相对数据质量': group_data['相对数据质量'].mean(),
'绝对数据质量': group_data['绝对数据质量'].mean(),
'综合数据质量': group_data['综合数据质量'].mean()
})
risk_analysis.append({
'分组标签': label,
't_min': t_min,
'窗口期风险': rt,
'检测风险': self.calculate_penalized_re_risk(group_data, result.x)
})
return pd.DataFrame(results), pd.DataFrame(risk_analysis)
def visualize(self, results_df, risk_df):
"""优化后的可视化模块"""
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(18, 12))
# 子图1:最佳检测时间分布
ax1 = fig.add_subplot(221)
sns.barplot(x='分组标签', y='最佳检测周', data=results_df, ci=None, palette='rocket')
ax1.set_title('不同BMI组最佳检测时间', fontsize=14)
ax1.set_xlabel('BMI分组')
ax1.set_ylabel('最佳检测周(周)')
# 子图2:数据质量与风险关系
ax2 = fig.add_subplot(222)
results_df[['相对数据质量', '综合数据质量', '最小总风险']].plot(kind='bar', ax=ax2, colormap='viridis', edgecolor='black')
ax2.set_title('数据质量与总风险关系', fontsize=14)
ax2.set_xticklabels(results_df['分组标签'], rotation=45)
# 子图3:风险对比分析
ax3 = fig.add_subplot(223)
risk_df[['窗口期风险', '检测风险']].plot(kind='bar', ax=ax3, color=['#1f77b4', '#ff7f0e'], width=0.7)
ax3.set_title('风险构成分析', fontsize=14)
ax3.grid(axis='y', linestyle='--', alpha=0.7)
# 子图4:总风险分布密度
ax4 = fig.add_subplot(224)
sns.kdeplot(data=results_df, x='最小总风险', fill=True, ax=ax4, color='#2ca02c', thresh=0.05)
ax4.set_title('总风险分布密度', fontsize=14)
ax4.set_xlabel('最小总风险值')
ax4.set_ylabel('密度')
plt.tight_layout()
plt.savefig('optimization_results.png', dpi=300, bbox_inches='tight')
plt.show()
# 示例运行
if __name__ == "__main__":
optimizer = NIPTOptimizer("男胎检测数据_预处理.xlsx")
optimal_df, risk_df = optimizer.optimize()
print("=== 优化结果 ===")
print(optimal_df[['分组标签', 'BMI区间', '最佳检测周', '最小总风险', '平均窗口期风险']])
print("\n=== 风险分析 ===")
print(risk_df[['分组标签', '窗口期风险', '检测风险']])
optimizer.visualize(optimal_df, risk_df)