import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test
from itertools import combinations
#设置格式便于展
plt.rcParams["font.size"] = 10
class SurvivalAnalyzer:
def __init__(self):
self.best_cutpoints = [] # 最终有效分割点(深度、分割值、P值)
self.groups = [] # 最终分组的样本索引
self.step_records = [] # 记录每一步分割过程的详细信息
self.group_id = 0 # 用于标记最终分组的唯一ID
self.optimal_times = {} # 存储每个分组的最佳检测时点
def log_rank_test(self, time, event, groups):
"""Log-Rank检验(多组比较时校正P值)"""
unique_groups = np.unique(groups)
if len(unique_groups) <= 1:
return 1.0
p_values = []
for g1, g2 in combinations(unique_groups, 2):
idx1 = groups == g1
idx2 = groups == g2
result = logrank_test(time[idx1], time[idx2], event[idx1], event[idx2])
p_values.append(result.p_value)
corrected_p = min(np.min(p_values) * len(p_values), 1.0) # Bonferroni校正
return corrected_p
def find_best_cutpoint(self, time, event, bmi_values, min_samples=20):
"""寻找最佳分割点(遍历所有可能分割点,选P值最小的)"""
sorted_idx = np.argsort(bmi_values)
sorted_bmi = bmi_values[sorted_idx]
sorted_time = time[sorted_idx]
sorted_event = event[sorted_idx]
n = len(sorted_bmi)
best_p, best_cut = 1.0, None
# 遍历所有满足最小样本量的分割点
for i in range(min_samples, n - min_samples):
cut = sorted_bmi[i]
g1_idx = sorted_idx[:i] # 左子组(BMI < cut)
g2_idx = sorted_idx[i:] # 右子组(BMI >= cut)
if len(g1_idx) < min_samples or len(g2_idx) < min_samples:
continue
# 计算当前分割点的Log-Rank P值
result = logrank_test(time[g1_idx], time[g2_idx], event[g1_idx], event[g2_idx])
if result.p_value < best_p:
best_p = result.p_value
best_cut = cut
return best_cut, best_p
def recursive_partitioning(self, time, event, bmi_values, global_idx,
current_depth=0, min_samples=20, p_threshold=0.05, max_depth=3):
"""改进的递归二分法:记录过程+全局索引跟踪"""
# 当前子组的基本信息
current_bmi = bmi_values[global_idx] # 当前子组的BMI值
current_size = len(global_idx) # 当前子组样本量
current_bmi_sorted = np.sort(current_bmi) # 排序后的BMI(用于可视化)
# 记录:当前子组的初始状态
step_info = {
"depth": current_depth,
"subgroup_id": f"D{current_depth}_G{len(self.step_records) + 1}", # 子组唯一标识(深度_序号)
"sample_size": current_size,
"bmi_range": (current_bmi.min(), current_bmi.max()),
"bmi_sorted": current_bmi_sorted,
"best_cut": None,
"logrank_p": None,
"is_partitioned": False,
"left_subgroup": None, # 左子组(BMI <= cut)的全局索引
"right_subgroup": None, # 右子组(BMI > cut)的全局索引
"stop_reason": None # 终止分割的原因
}
# 检查终止条件:样本量不足/深度超限/P值不显著
if current_size < 2 * min_samples:
step_info["stop_reason"] = f"样本量不足({current_size} < {2 * min_samples})"
self.step_records.append(step_info)
self.groups.append(
{"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]})
self.group_id += 1
return
if current_depth >= max_depth:
step_info["stop_reason"] = f"达到最大深度({current_depth} >= {max_depth})"
self.step_records.append(step_info)
self.groups.append(
{"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]})
self.group_id += 1
return
# 寻找当前子组的最佳分割点
best_cut, best_p = self.find_best_cutpoint(
time=time[global_idx],
event=event[global_idx],
bmi_values=current_bmi,
min_samples=min_samples
)
# 检查分割点有效性
if best_cut is None or best_p >= p_threshold:
step_info["logrank_p"] = best_p if best_p is not None else "N/A"
step_info["stop_reason"] = f"Log-Rank P值不显著(p={step_info['logrank_p']:.4f} >= {p_threshold})"
self.step_records.append(step_info)
self.groups.append(
{"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]})
self.group_id += 1
return
# 分割有效:记录分割信息并递归处理子组
step_info["best_cut"] = best_cut
step_info["logrank_p"] = best_p
step_info["is_partitioned"] = True
self.best_cutpoints.append((current_depth, best_cut, best_p)) # 加入最终有效分割点
# 划分左、右子组(基于全局索引)
left_mask = current_bmi <= best_cut
right_mask = current_bmi > best_cut
left_global_idx = global_idx[left_mask]
right_global_idx = global_idx[right_mask]
step_info["left_subgroup"] = {"size": len(left_global_idx),
"bmi_range": (current_bmi[left_mask].min(), current_bmi[left_mask].max())}
step_info["right_subgroup"] = {"size": len(right_global_idx),
"bmi_range": (current_bmi[right_mask].min(), current_bmi[right_mask].max())}
self.step_records.append(step_info)
# 递归处理左、右子组
self.recursive_partitioning(time, event, bmi_values, left_global_idx, current_depth + 1, min_samples,
p_threshold, max_depth)
self.recursive_partitioning(time, event, bmi_values, right_global_idx, current_depth + 1, min_samples,
p_threshold, max_depth)
def determine_optimal_time(self, time, event):
"""
确定每个分组的最佳NIPT检测时点
核心规则:在Kaplan-Meier生存曲线上,找到生存概率S(t)=0.1对应的最小孕周t*
若t*≤12周则取t*,若t^*>12周则取"S(t)≤0.1且最接近12周"的时点
"""
print("\n2.2 最佳NIPT时点确定模型(优化类)")
print("核心目标:在'高检测成功率(准确性)'与'早期干预(低风险)'之间寻求最优平衡。")
print("时点选择准则(平衡准确性与风险)")
print("临床目标:在'检测准确性(达标率≥90%)'与'早期检测(降低治疗风险)'间找最优解,依据文档3的风险阶段:")
print("\t早期(≤12 周):风险低;")
print("\t中期(13-27 周):风险高;")
print("\t晚期(≥28 周):风险极高。")
print(
"核心规则:在每组Kaplan-Meier生存曲线上,选取生存概率S(t)=0.1对应的最小孕周t^*(即仅10%孕妇未达标,90%已达标),")
print(" 若t^*≤12周则取t^*,若t^*>12周则取'S(t)≤0.1且最接近12周'的时点。\n")
# 按BMI范围升序排列分组
sorted_groups = sorted(self.groups, key=lambda x: x["bmi_range"][0])
# 为每个分组确定最佳检测时点
for group in sorted_groups:
group_id = group["group_id"]
group_idx = group["global_idx"]
bmi_range = group["bmi_range"]
# 拟合Kaplan-Meier曲线
kmf = KaplanMeierFitter()
kmf.fit(
durations=time[group_idx],
event_observed=event[group_idx],
alpha=0.05
)
# 获取生存曲线数据
survival_function = kmf.survival_function_
times = survival_function.index.values
survival_probs = survival_function.iloc[:, 0].values
# 找到生存概率S(t)=0.1对应的最小孕周t*
# 首先找到所有生存概率≤0.1的时间点
threshold = 0.1
valid_indices = np.where(survival_probs <= threshold)[0]
if len(valid_indices) == 0:
# 如果没有达标率≥90%的时间点,取最后一个时间点
optimal_time = times[-1]
reason = "未达到90%达标率,取最后一个观测时间点"
else:
# 找到达标率首次达到90%的时间点
t_star = times[valid_indices[0]]
# 应用核心规则
if t_star <= 12:
optimal_time = t_star
reason = f"t*={t_star:.1f}≤12周,直接选取"
else:
# 找到≤12周且最接近12周的时间点,且该时间点达标率≥90%
before_12_indices = np.where((times <= 12) & (survival_probs <= threshold))[0]
if len(before_12_indices) > 0:
optimal_time = times[before_12_indices[-1]]
reason = f"t*={t_star:.1f}>12周,选取≤12周且最接近12周的达标时点"
else:
# 如果12周前没有达标率≥90%的时间点,取首次达标时间点
optimal_time = t_star
reason = f"12周前未达到90%达标率,选取首次达标时间点t*={t_star:.1f}"
# 记录最佳时点及其达标率
# 找到最佳时点对应的生存概率
closest_idx = np.argmin(np.abs(times - optimal_time))
survival_prob_at_optimal = survival_probs[closest_idx]
success_rate = 1 - survival_prob_at_optimal
self.optimal_times[group_id] = {
"bmi_range": bmi_range,
"optimal_time": optimal_time,
"success_rate": success_rate,
"reason": reason,
"t_star": t_star if 't_star' in locals() else None
}
# 输出结果
print(f"组{group_id}(BMI [{bmi_range[0]:.1f}, {bmi_range[1]:.1f}]):")
print(f" 最佳检测时点: {optimal_time:.1f}周")
print(f" 此时点达标率: {success_rate:.2%}")
print(f" 选择原因: {reason}\n")
def plot_partition_process(self, save_path=None):
"""可视化二分法分割全过程"""
if not self.step_records:
print("无分割过程记录,无法可视化")
return
# 按深度分组,统计每一层的子组数量
depth_groups = {}
for step in self.step_records:
depth = step["depth"]
if depth not in depth_groups:
depth_groups[depth] = []
depth_groups[depth].append(step)
max_depth = max(depth_groups.keys()) if depth_groups else 0
max_subgroups_per_depth = max(len(subgroups) for subgroups in depth_groups.values()) if depth_groups else 1
# 设置画布大小(根据深度和子组数量动态调整)
fig, axes = plt.subplots(
nrows=max_depth + 1,
ncols=max_subgroups_per_depth,
figsize=(4 * max_subgroups_per_depth, 3 * (max_depth + 1)),
squeeze=False # 确保即使只有1个子图也保持2D数组结构
)
# 遍历每一层、每个子组,绘制分割详情
for depth in sorted(depth_groups.keys()):
subgroups = depth_groups[depth]
for idx, step in enumerate(subgroups):
ax = axes[depth, idx]
# 1. 绘制当前子组的BMI分布直方图
bmi_sorted = step["bmi_sorted"]
ax.hist(bmi_sorted, bins=15, alpha=0.6, color="#87CEEB", edgecolor="black", linewidth=0.5,
label=f"样本数:{step['sample_size']}")
# 2. 若有有效分割点,绘制竖线标记
if step["is_partitioned"]:
ax.axvline(x=step["best_cut"], color="red", linewidth=2, linestyle="--",
label=f"分割点:{step['best_cut']:.2f}")
# 标注Log-Rank P值
ax.text(0.02, 0.95, f"Log-Rank p={step['logrank_p']:.4f}",
transform=ax.transAxes, verticalalignment="top",
bbox=dict(boxstyle="round", facecolor="white", alpha=0.8),
fontsize=9)
# 3. 标注子组信息和终止原因
title = f"{step['subgroup_id']} | BMI: [{step['bmi_range'][0]:.1f}, {step['bmi_range'][1]:.1f}]"
if step["stop_reason"]:
title += f"\n终止:{step['stop_reason']}"
ax.set_title(title, fontsize=10, pad=10)
# 4. 设置坐标轴
ax.set_xlabel("BMI值", fontsize=9)
ax.set_ylabel("样本频数", fontsize=9)
ax.legend(fontsize=8, loc="upper right")
ax.grid(alpha=0.3, linestyle=":")
# 5. 隐藏空的子图(若某一层子组数量少于最大数量)
if idx >= len(subgroups) and max_subgroups_per_depth > len(subgroups):
ax.axis("off")
# 调整子图间距,避免重叠
plt.tight_layout(pad=2.0)
# 保存或显示图片
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
print(f"分割过程图已保存至:{save_path}")
plt.show()
def plot_survival_curves(self, time, event, save_path=None):
"""基于最终分组绘制生存曲线,并标记最佳检测时点"""
if not self.groups:
print("无最终分组,无法绘制生存曲线")
return
kmf = KaplanMeierFitter()
fig, ax = plt.subplots(figsize=(12, 6))
# 按BMI范围升序排列分组(确保曲线顺序合理)
sorted_groups = sorted(self.groups, key=lambda x: x["bmi_range"][0])
# 绘制每个分组的生存曲线
colors = plt.cm.Set3(np.linspace(0, 1, len(sorted_groups))) # 配色方案
for i, group in enumerate(sorted_groups):
group_id = group["group_id"]
group_idx = group["global_idx"]
bmi_range = group["bmi_range"]
group_label = f"组{group_id}:BMI [{bmi_range[0]:.1f}, {bmi_range[1]:.1f}](n={len(group_idx)})"
# 拟合Kaplan-Meier曲线
kmf.fit(
durations=time[group_idx],
event_observed=event[group_idx],
label=group_label,
alpha=0.05
)
kmf.plot_survival_function(ax=ax, ci_show=True, linewidth=2, color=colors[i], ci_alpha=0.2)
# 标记最佳检测时点
if group_id in self.optimal_times:
optimal_time = self.optimal_times[group_id]["optimal_time"]
ax.axvline(x=optimal_time, color=colors[i], linestyle=":", linewidth=2)
ax.scatter(optimal_time, 0.1, color=colors[i], s=100, zorder=5)
ax.text(optimal_time + 0.5, 0.12, f"最佳时点: {optimal_time:.1f}周",
color=colors[i], fontsize=9)
# 标记12周界限和90%达标率线
ax.axvline(x=12, color='gray', linestyle='-', alpha=0.5, label='12周界限')
ax.axhline(y=0.1, color='black', linestyle='--', alpha=0.5, label='90%达标率线')
# 设置图表样式
ax.set_title("不同BMI组男胎Y染色体达标生存曲线(含最佳检测时点)", fontsize=14, pad=15)
ax.set_xlabel("检测孕周(周)", fontsize=12)
ax.set_ylabel("未达标率(生存概率)", fontsize=12)
ax.legend(title="BMI分组", loc="upper right", fontsize=10, title_fontsize=11)
ax.grid(alpha=0.3, linestyle=":")
ax.set_xlim(10, 25) # 限制孕周范围(与数据筛选一致)
ax.set_ylim(0, 1) # 生存概率范围
# 保存或显示图片
plt.tight_layout()
if save_path:
plt.savefig(save_path, dpi=300, bbox_inches="tight")
print(f"生存曲线图已保存至:{save_path}")
plt.show()
def process_nipt_data(file_path):
"""数据预处理函数(删除原始样本数、Y列信息、男胎筛选的打印语句)"""
# 读取数据
try:
df_raw = pd.read_excel(file_path, sheet_name="男胎检测数据")
except ValueError:
df_raw = pd.read_excel(file_path, sheet_name=0)
# 手动指定列名(根据实际数据调整)
ACTUAL_WEEK_COLUMN = "检测孕周"
ACTUAL_BMI_COLUMN = "孕妇BMI"
ACTUAL_Y_COLUMN = "Y染色体浓度"
ACTUAL_ID_COLUMN = "孕妇代码"
GENDER_COLUMN = "胎儿性别"
# 检查必需列
required_cols = [ACTUAL_WEEK_COLUMN, ACTUAL_BMI_COLUMN, ACTUAL_Y_COLUMN, ACTUAL_ID_COLUMN]
for col in required_cols:
if col not in df_raw.columns:
raise KeyError(f"未找到列:{col},请检查列名是否正确")
# 转换Y染色体浓度列为数值类型
try:
df_raw[ACTUAL_Y_COLUMN] = pd.to_numeric(df_raw[ACTUAL_Y_COLUMN], errors='coerce')
except Exception as e:
print(f"转换数值类型时出错:{e}")
# 筛选男胎样本
if GENDER_COLUMN in df_raw.columns:
df_male = df_raw[df_raw[GENDER_COLUMN].str.contains('男', na=False)].reset_index(drop=True)
else:
df_male = df_raw[df_raw[ACTUAL_Y_COLUMN].notna()].reset_index(drop=True)
if len(df_male) == 0:
raise ValueError("筛选后男胎样本数为0,无法继续分析")
# 阈值自动检测(Y染色体浓度达标标准)
print("\n=== 阈值自动检测(Y染色体浓度达标标准)===")
thresholds = [0.01, 0.04, 1.0, 4.0, 10.0] # 常见阈值候选
threshold_results = []
for threshold in thresholds:
count = (df_male[ACTUAL_Y_COLUMN] >= threshold).sum()
percentage = count / len(df_male) * 100
threshold_results.append((threshold, count, percentage))
print(f"阈值={threshold}:达标数={count}({percentage:.1f}%)")
# 选择最佳阈值(优先10%-90%达标率,否则选最接近50%的)
valid_thresholds = [(t, c, p) for t, c, p in threshold_results if 10 <= p <= 90]
if valid_thresholds:
best_threshold = min(valid_thresholds, key=lambda x: abs(x[2] - 50))
else:
best_threshold = max(threshold_results, key=lambda x: x[1]) # fallback:选达标数最多的
print(f"自动选择最佳阈值:{best_threshold[0]}(达标数={best_threshold[1]},占比={best_threshold[2]:.1f}%)")
df_male["is_qualified"] = (df_male[ACTUAL_Y_COLUMN] >= best_threshold[0]).astype(int)
# 孕周处理
print("\n=== 孕周处理 ===")
def convert_gestational_week(week_str):
"""将"Xw+Yd"格式转换为小数孕周"""
if pd.isna(week_str) or "w" not in str(week_str):
return np.nan
week_part, day_part = str(week_str).split("w", 1)
weeks = float(week_part.strip())
days = float(day_part.strip().split("+")[1]) if "+" in day_part else 0.0
return round(weeks + days / 7, 2)
df_male["gestational_week"] = df_male[ACTUAL_WEEK_COLUMN].apply(convert_gestational_week)
print(f"孕周转换后缺失值数量:{df_male['gestational_week'].isna().sum()}")
# 筛选10-25周的有效样本
df_male = df_male[(df_male["gestational_week"] >= 10.0) &
(df_male["gestational_week"] <= 25.0)].reset_index(drop=True)
print(f"有效孕周(10-25周)样本数:{len(df_male)}")
if len(df_male) < 20:
raise ValueError(f"有效样本数过少({len(df_male)} < 20),无法进行二分法分组")
# 按孕妇去重
print("\n=== 按孕妇去重 ===")
surv_data = []
for pid, preg_group in df_male.groupby(ACTUAL_ID_COLUMN):
# 按孕周排序,取最新记录的BMI
preg_group_sorted = preg_group.sort_values("gestational_week").reset_index(drop=True)
last_record = preg_group_sorted.iloc[-1] # 最新一次检测记录
# 确定达标时间(首次达标孕周,未达标则取最后一次检测孕周)
qualified_rows = preg_group_sorted[preg_group_sorted["is_qualified"] == 1]
if not qualified_rows.empty:
qualify_time = qualified_rows["gestational_week"].min()
event = 1 # 达标(事件发生)
else:
qualify_time = preg_group_sorted["gestational_week"].max()
event = 0 # 未达标(截尾数据)
# 筛选BMI在合理范围的样本(18-45)
bmi = last_record[ACTUAL_BMI_COLUMN]
if 18.0 <= bmi <= 45.0 and not pd.isna(qualify_time):
surv_data.append({
ACTUAL_ID_COLUMN: pid,
"BMI": bmi,
"time": qualify_time, # 达标时间/截尾时间
"event": event # 是否达标(1=是,0=否)
})
# 生成最终生存分析数据集
surv_df = pd.DataFrame(surv_data).drop_duplicates(ACTUAL_ID_COLUMN).reset_index(drop=True)
print(f"\n=== 数据处理完成 ===")
print(f"有效孕妇样本数:{len(surv_df)}")
print(f"达标孕妇数:{surv_df['event'].sum()}(占比:{surv_df['event'].sum() / len(surv_df) * 100:.1f}%)")
print(f"BMI范围:{surv_df['BMI'].min():.1f} - {surv_df['BMI'].max():.1f}")
print(f"孕周范围:{surv_df['time'].min():.1f} - {surv_df['time'].max():.1f}周")
if len(surv_df) < 20:
raise ValueError(f"最终有效样本数过少({len(surv_df)} < 20),无法进行二分法分组")
return surv_df
if __name__ == "__main__":
EXCEL_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\附件_YNgHYs.xlsx"
PARTITION_PLOT_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\BMI二分法分割过程.png"
SURVIVAL_PLOT_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\BMI分组生存曲线.png"
try:
# 2. 数据预处理:得到生存分析所需的(BMI, time, event)数据
surv_df = process_nipt_data(file_path=EXCEL_PATH)
time = surv_df["time"].values # 达标时间/截尾时间(孕周)
event = surv_df["event"].values # 是否达标(1=是,0=否)
bmi_values = surv_df["BMI"].values # 孕妇BMI值
global_idx = np.arange(len(surv_df)) # 样本全局索引(用于跟踪分组)
# 3. 初始化分析器并执行递归二分法
analyzer = SurvivalAnalyzer()
print(f"\n=== 开始递归二分法分组 ===")
analyzer.recursive_partitioning(
time=time,
event=event,
bmi_values=bmi_values,
global_idx=global_idx,
min_samples=20, # 子组最小样本量(小于2*20则终止)
p_threshold=0.05, # Log-Rank检验显著性阈值
max_depth=3 # 最大递归深度(避免过拟合)
)
# 4. 输出分组结果摘要
print(f"\n=== 分组结果摘要 ===")
print(f"有效分割点数量:{len(analyzer.best_cutpoints)}")
for i, (depth, cut, p) in enumerate(analyzer.best_cutpoints, 1):
print(f" 分割点{i}:深度{depth},BMI={cut:.2f},Log-Rank p={p:.4f}")
print(f"最终分组数:{len(analyzer.groups)}")
for group in analyzer.groups:
print(
f" 组{group['group_id']}:BMI [{group['bmi_range'][0]:.1f}, {group['bmi_range'][1]:.1f}],样本数={len(group['global_idx'])}")
# 5. 确定并输出最佳NIPT检测时点
analyzer.determine_optimal_time(time, event)
# 6. 可视化:先画分割过程,再画生存曲线(含最佳时点标记)
analyzer.plot_partition_process(save_path=PARTITION_PLOT_PATH)
analyzer.plot_survival_curves(time=time, event=event, save_path=SURVIVAL_PLOT_PATH)
except Exception as e:
print(f"\n执行错误:{str(e)}")
将代码中使用绝对路径引用文件的地方全部改为使用文件名