week_15_ Combinations

Description

Given two integers n and k, return all possible combinations of k numbers out of 1 ... n.

For example,
If n = 4 and k = 2, a solution is:

[
  [2,4],
  [3,4],
  [2,3],
  [1,2],
  [1,3],
  [1,4],
]

Solution

n个数选取k个排列组合,只需要进行k层递归即可,其思路如下:

① 从begin开始,对begin到n进行遍历,选取新元素i,加入组合result。

② 对result的当前大小与k进行比较,如果相等,则完成组合,将result加入最终的list。

③ 若不相等,则以新元素的下一元素为begin,进入下一层递归。

④ 将新元素从result中弹出,以对后面的元素继续进行遍历。

代码实现如下:

// 用递归进行解决
class Solution {
public:
	vector<vector<int>> combine(int n, int k) {
		vector<vector<int>> resultList;
		recursCombine(n, k, 1, resultList);
		return resultList;
	}

	void recursCombine(int n, int k, int begin, vector<vector<int>>& list) {
		for (int i = begin; i <= n; i++) {
			int new_element = i;
			result.push_back(new_element);
			if (result.size() == k)
				list.push_back(result);
			else
				recursCombine(n, k, i + 1, list);
			result.pop_back();
		}
	}
private:
	vector<int> result;
};

运行结果如下:

import numpy as np import pandas as pd import matplotlib.pyplot as plt from lifelines import KaplanMeierFitter from lifelines.statistics import logrank_test from itertools import combinations #设置格式便于展 plt.rcParams["font.size"] = 10 class SurvivalAnalyzer: def __init__(self): self.best_cutpoints = [] # 最终有效分割点(深度、分割值、P值) self.groups = [] # 最终分组的样本索引 self.step_records = [] # 记录每一步分割过程的详细信息 self.group_id = 0 # 用于标记最终分组的唯一ID self.optimal_times = {} # 存储每个分组的最佳检测时点 def log_rank_test(self, time, event, groups): """Log-Rank检验(多组比较时校正P值)""" unique_groups = np.unique(groups) if len(unique_groups) <= 1: return 1.0 p_values = [] for g1, g2 in combinations(unique_groups, 2): idx1 = groups == g1 idx2 = groups == g2 result = logrank_test(time[idx1], time[idx2], event[idx1], event[idx2]) p_values.append(result.p_value) corrected_p = min(np.min(p_values) * len(p_values), 1.0) # Bonferroni校正 return corrected_p def find_best_cutpoint(self, time, event, bmi_values, min_samples=20): """寻找最佳分割点(遍历所有可能分割点,选P值最小的)""" sorted_idx = np.argsort(bmi_values) sorted_bmi = bmi_values[sorted_idx] sorted_time = time[sorted_idx] sorted_event = event[sorted_idx] n = len(sorted_bmi) best_p, best_cut = 1.0, None # 遍历所有满足最小样本量的分割点 for i in range(min_samples, n - min_samples): cut = sorted_bmi[i] g1_idx = sorted_idx[:i] # 左子组(BMI < cut) g2_idx = sorted_idx[i:] # 右子组(BMI >= cut) if len(g1_idx) < min_samples or len(g2_idx) < min_samples: continue # 计算当前分割点的Log-Rank P值 result = logrank_test(time[g1_idx], time[g2_idx], event[g1_idx], event[g2_idx]) if result.p_value < best_p: best_p = result.p_value best_cut = cut return best_cut, best_p def recursive_partitioning(self, time, event, bmi_values, global_idx, current_depth=0, min_samples=20, p_threshold=0.05, max_depth=3): """改进的递归二分法:记录过程+全局索引跟踪""" # 当前子组的基本信息 current_bmi = bmi_values[global_idx] # 当前子组的BMI值 current_size = len(global_idx) # 当前子组样本量 current_bmi_sorted = np.sort(current_bmi) # 排序后的BMI(用于可视化) # 记录:当前子组的初始状态 step_info = { "depth": current_depth, "subgroup_id": f"D{current_depth}_G{len(self.step_records) + 1}", # 子组唯一标识(深度_序号) "sample_size": current_size, "bmi_range": (current_bmi.min(), current_bmi.max()), "bmi_sorted": current_bmi_sorted, "best_cut": None, "logrank_p": None, "is_partitioned": False, "left_subgroup": None, # 左子组(BMI <= cut)的全局索引 "right_subgroup": None, # 右子组(BMI > cut)的全局索引 "stop_reason": None # 终止分割的原因 } # 检查终止条件:样本量不足/深度超限/P值不显著 if current_size < 2 * min_samples: step_info["stop_reason"] = f"样本量不足({current_size} < {2 * min_samples})" self.step_records.append(step_info) self.groups.append( {"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]}) self.group_id += 1 return if current_depth >= max_depth: step_info["stop_reason"] = f"达到最大深度({current_depth} >= {max_depth})" self.step_records.append(step_info) self.groups.append( {"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]}) self.group_id += 1 return # 寻找当前子组的最佳分割点 best_cut, best_p = self.find_best_cutpoint( time=time[global_idx], event=event[global_idx], bmi_values=current_bmi, min_samples=min_samples ) # 检查分割点有效性 if best_cut is None or best_p >= p_threshold: step_info["logrank_p"] = best_p if best_p is not None else "N/A" step_info["stop_reason"] = f"Log-Rank P值不显著(p={step_info['logrank_p']:.4f} >= {p_threshold})" self.step_records.append(step_info) self.groups.append( {"group_id": self.group_id, "global_idx": global_idx, "bmi_range": step_info["bmi_range"]}) self.group_id += 1 return # 分割有效:记录分割信息并递归处理子组 step_info["best_cut"] = best_cut step_info["logrank_p"] = best_p step_info["is_partitioned"] = True self.best_cutpoints.append((current_depth, best_cut, best_p)) # 加入最终有效分割点 # 划分左、右子组(基于全局索引) left_mask = current_bmi <= best_cut right_mask = current_bmi > best_cut left_global_idx = global_idx[left_mask] right_global_idx = global_idx[right_mask] step_info["left_subgroup"] = {"size": len(left_global_idx), "bmi_range": (current_bmi[left_mask].min(), current_bmi[left_mask].max())} step_info["right_subgroup"] = {"size": len(right_global_idx), "bmi_range": (current_bmi[right_mask].min(), current_bmi[right_mask].max())} self.step_records.append(step_info) # 递归处理左、右子组 self.recursive_partitioning(time, event, bmi_values, left_global_idx, current_depth + 1, min_samples, p_threshold, max_depth) self.recursive_partitioning(time, event, bmi_values, right_global_idx, current_depth + 1, min_samples, p_threshold, max_depth) def determine_optimal_time(self, time, event): """ 确定每个分组的最佳NIPT检测时点 核心规则:在Kaplan-Meier生存曲线上,找到生存概率S(t)=0.1对应的最小孕周t* 若t*≤12周则取t*,若t^*>12周则取"S(t)≤0.1且最接近12周"的时点 """ print("\n2.2 最佳NIPT时点确定模型(优化类)") print("核心目标:在'高检测成功率(准确性)'与'早期干预(低风险)'之间寻求最优平衡。") print("时点选择准则(平衡准确性与风险)") print("临床目标:在'检测准确性(达标率≥90%)'与'早期检测(降低治疗风险)'间找最优解,依据文档3的风险阶段:") print("\t早期(≤12 周):风险低;") print("\t中期(13-27 周):风险高;") print("\t晚期(≥28 周):风险极高。") print( "核心规则:在每组Kaplan-Meier生存曲线上,选取生存概率S(t)=0.1对应的最小孕周t^*(即仅10%孕妇未达标,90%已达标),") print(" 若t^*≤12周则取t^*,若t^*>12周则取'S(t)≤0.1且最接近12周'的时点。\n") # 按BMI范围升序排列分组 sorted_groups = sorted(self.groups, key=lambda x: x["bmi_range"][0]) # 为每个分组确定最佳检测时点 for group in sorted_groups: group_id = group["group_id"] group_idx = group["global_idx"] bmi_range = group["bmi_range"] # 拟合Kaplan-Meier曲线 kmf = KaplanMeierFitter() kmf.fit( durations=time[group_idx], event_observed=event[group_idx], alpha=0.05 ) # 获取生存曲线数据 survival_function = kmf.survival_function_ times = survival_function.index.values survival_probs = survival_function.iloc[:, 0].values # 找到生存概率S(t)=0.1对应的最小孕周t* # 首先找到所有生存概率≤0.1的时间点 threshold = 0.1 valid_indices = np.where(survival_probs <= threshold)[0] if len(valid_indices) == 0: # 如果没有达标率≥90%的时间点,取最后一个时间点 optimal_time = times[-1] reason = "未达到90%达标率,取最后一个观测时间点" else: # 找到达标率首次达到90%的时间点 t_star = times[valid_indices[0]] # 应用核心规则 if t_star <= 12: optimal_time = t_star reason = f"t*={t_star:.1f}≤12周,直接选取" else: # 找到≤12周且最接近12周的时间点,且该时间点达标率≥90% before_12_indices = np.where((times <= 12) & (survival_probs <= threshold))[0] if len(before_12_indices) > 0: optimal_time = times[before_12_indices[-1]] reason = f"t*={t_star:.1f}>12周,选取≤12周且最接近12周的达标时点" else: # 如果12周前没有达标率≥90%的时间点,取首次达标时间点 optimal_time = t_star reason = f"12周前未达到90%达标率,选取首次达标时间点t*={t_star:.1f}" # 记录最佳时点及其达标率 # 找到最佳时点对应的生存概率 closest_idx = np.argmin(np.abs(times - optimal_time)) survival_prob_at_optimal = survival_probs[closest_idx] success_rate = 1 - survival_prob_at_optimal self.optimal_times[group_id] = { "bmi_range": bmi_range, "optimal_time": optimal_time, "success_rate": success_rate, "reason": reason, "t_star": t_star if 't_star' in locals() else None } # 输出结果 print(f"组{group_id}(BMI [{bmi_range[0]:.1f}, {bmi_range[1]:.1f}]):") print(f" 最佳检测时点: {optimal_time:.1f}周") print(f" 此时点达标率: {success_rate:.2%}") print(f" 选择原因: {reason}\n") def plot_partition_process(self, save_path=None): """可视化二分法分割全过程""" if not self.step_records: print("无分割过程记录,无法可视化") return # 按深度分组,统计每一层的子组数量 depth_groups = {} for step in self.step_records: depth = step["depth"] if depth not in depth_groups: depth_groups[depth] = [] depth_groups[depth].append(step) max_depth = max(depth_groups.keys()) if depth_groups else 0 max_subgroups_per_depth = max(len(subgroups) for subgroups in depth_groups.values()) if depth_groups else 1 # 设置画布大小(根据深度和子组数量动态调整) fig, axes = plt.subplots( nrows=max_depth + 1, ncols=max_subgroups_per_depth, figsize=(4 * max_subgroups_per_depth, 3 * (max_depth + 1)), squeeze=False # 确保即使只有1个子图也保持2D数组结构 ) # 遍历每一层、每个子组,绘制分割详情 for depth in sorted(depth_groups.keys()): subgroups = depth_groups[depth] for idx, step in enumerate(subgroups): ax = axes[depth, idx] # 1. 绘制当前子组的BMI分布直方图 bmi_sorted = step["bmi_sorted"] ax.hist(bmi_sorted, bins=15, alpha=0.6, color="#87CEEB", edgecolor="black", linewidth=0.5, label=f"样本数:{step['sample_size']}") # 2. 若有有效分割点,绘制竖线标记 if step["is_partitioned"]: ax.axvline(x=step["best_cut"], color="red", linewidth=2, linestyle="--", label=f"分割点:{step['best_cut']:.2f}") # 标注Log-Rank P值 ax.text(0.02, 0.95, f"Log-Rank p={step['logrank_p']:.4f}", transform=ax.transAxes, verticalalignment="top", bbox=dict(boxstyle="round", facecolor="white", alpha=0.8), fontsize=9) # 3. 标注子组信息和终止原因 title = f"{step['subgroup_id']} | BMI: [{step['bmi_range'][0]:.1f}, {step['bmi_range'][1]:.1f}]" if step["stop_reason"]: title += f"\n终止:{step['stop_reason']}" ax.set_title(title, fontsize=10, pad=10) # 4. 设置坐标轴 ax.set_xlabel("BMI值", fontsize=9) ax.set_ylabel("样本频数", fontsize=9) ax.legend(fontsize=8, loc="upper right") ax.grid(alpha=0.3, linestyle=":") # 5. 隐藏空的子图(若某一层子组数量少于最大数量) if idx >= len(subgroups) and max_subgroups_per_depth > len(subgroups): ax.axis("off") # 调整子图间距,避免重叠 plt.tight_layout(pad=2.0) # 保存或显示图片 if save_path: plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"分割过程图已保存至:{save_path}") plt.show() def plot_survival_curves(self, time, event, save_path=None): """基于最终分组绘制生存曲线,并标记最佳检测时点""" if not self.groups: print("无最终分组,无法绘制生存曲线") return kmf = KaplanMeierFitter() fig, ax = plt.subplots(figsize=(12, 6)) # 按BMI范围升序排列分组(确保曲线顺序合理) sorted_groups = sorted(self.groups, key=lambda x: x["bmi_range"][0]) # 绘制每个分组的生存曲线 colors = plt.cm.Set3(np.linspace(0, 1, len(sorted_groups))) # 配色方案 for i, group in enumerate(sorted_groups): group_id = group["group_id"] group_idx = group["global_idx"] bmi_range = group["bmi_range"] group_label = f"组{group_id}:BMI [{bmi_range[0]:.1f}, {bmi_range[1]:.1f}](n={len(group_idx)})" # 拟合Kaplan-Meier曲线 kmf.fit( durations=time[group_idx], event_observed=event[group_idx], label=group_label, alpha=0.05 ) kmf.plot_survival_function(ax=ax, ci_show=True, linewidth=2, color=colors[i], ci_alpha=0.2) # 标记最佳检测时点 if group_id in self.optimal_times: optimal_time = self.optimal_times[group_id]["optimal_time"] ax.axvline(x=optimal_time, color=colors[i], linestyle=":", linewidth=2) ax.scatter(optimal_time, 0.1, color=colors[i], s=100, zorder=5) ax.text(optimal_time + 0.5, 0.12, f"最佳时点: {optimal_time:.1f}周", color=colors[i], fontsize=9) # 标记12周界限和90%达标率线 ax.axvline(x=12, color='gray', linestyle='-', alpha=0.5, label='12周界限') ax.axhline(y=0.1, color='black', linestyle='--', alpha=0.5, label='90%达标率线') # 设置图表样式 ax.set_title("不同BMI组男胎Y染色体达标生存曲线(含最佳检测时点)", fontsize=14, pad=15) ax.set_xlabel("检测孕周(周)", fontsize=12) ax.set_ylabel("未达标率(生存概率)", fontsize=12) ax.legend(title="BMI分组", loc="upper right", fontsize=10, title_fontsize=11) ax.grid(alpha=0.3, linestyle=":") ax.set_xlim(10, 25) # 限制孕周范围(与数据筛选一致) ax.set_ylim(0, 1) # 生存概率范围 # 保存或显示图片 plt.tight_layout() if save_path: plt.savefig(save_path, dpi=300, bbox_inches="tight") print(f"生存曲线图已保存至:{save_path}") plt.show() def process_nipt_data(file_path): """数据预处理函数(删除原始样本数、Y列信息、男胎筛选的打印语句)""" # 读取数据 try: df_raw = pd.read_excel(file_path, sheet_name="男胎检测数据") except ValueError: df_raw = pd.read_excel(file_path, sheet_name=0) # 手动指定列名(根据实际数据调整) ACTUAL_WEEK_COLUMN = "检测孕周" ACTUAL_BMI_COLUMN = "孕妇BMI" ACTUAL_Y_COLUMN = "Y染色体浓度" ACTUAL_ID_COLUMN = "孕妇代码" GENDER_COLUMN = "胎儿性别" # 检查必需列 required_cols = [ACTUAL_WEEK_COLUMN, ACTUAL_BMI_COLUMN, ACTUAL_Y_COLUMN, ACTUAL_ID_COLUMN] for col in required_cols: if col not in df_raw.columns: raise KeyError(f"未找到列:{col},请检查列名是否正确") # 转换Y染色体浓度列为数值类型 try: df_raw[ACTUAL_Y_COLUMN] = pd.to_numeric(df_raw[ACTUAL_Y_COLUMN], errors='coerce') except Exception as e: print(f"转换数值类型时出错:{e}") # 筛选男胎样本 if GENDER_COLUMN in df_raw.columns: df_male = df_raw[df_raw[GENDER_COLUMN].str.contains('男', na=False)].reset_index(drop=True) else: df_male = df_raw[df_raw[ACTUAL_Y_COLUMN].notna()].reset_index(drop=True) if len(df_male) == 0: raise ValueError("筛选后男胎样本数为0,无法继续分析") # 阈值自动检测(Y染色体浓度达标标准) print("\n=== 阈值自动检测(Y染色体浓度达标标准)===") thresholds = [0.01, 0.04, 1.0, 4.0, 10.0] # 常见阈值候选 threshold_results = [] for threshold in thresholds: count = (df_male[ACTUAL_Y_COLUMN] >= threshold).sum() percentage = count / len(df_male) * 100 threshold_results.append((threshold, count, percentage)) print(f"阈值={threshold}:达标数={count}({percentage:.1f}%)") # 选择最佳阈值(优先10%-90%达标率,否则选最接近50%的) valid_thresholds = [(t, c, p) for t, c, p in threshold_results if 10 <= p <= 90] if valid_thresholds: best_threshold = min(valid_thresholds, key=lambda x: abs(x[2] - 50)) else: best_threshold = max(threshold_results, key=lambda x: x[1]) # fallback:选达标数最多的 print(f"自动选择最佳阈值:{best_threshold[0]}(达标数={best_threshold[1]},占比={best_threshold[2]:.1f}%)") df_male["is_qualified"] = (df_male[ACTUAL_Y_COLUMN] >= best_threshold[0]).astype(int) # 孕周处理 print("\n=== 孕周处理 ===") def convert_gestational_week(week_str): """将"Xw+Yd"格式转换为小数孕周""" if pd.isna(week_str) or "w" not in str(week_str): return np.nan week_part, day_part = str(week_str).split("w", 1) weeks = float(week_part.strip()) days = float(day_part.strip().split("+")[1]) if "+" in day_part else 0.0 return round(weeks + days / 7, 2) df_male["gestational_week"] = df_male[ACTUAL_WEEK_COLUMN].apply(convert_gestational_week) print(f"孕周转换后缺失值数量:{df_male['gestational_week'].isna().sum()}") # 筛选10-25周的有效样本 df_male = df_male[(df_male["gestational_week"] >= 10.0) & (df_male["gestational_week"] <= 25.0)].reset_index(drop=True) print(f"有效孕周(10-25周)样本数:{len(df_male)}") if len(df_male) < 20: raise ValueError(f"有效样本数过少({len(df_male)} < 20),无法进行二分法分组") # 按孕妇去重 print("\n=== 按孕妇去重 ===") surv_data = [] for pid, preg_group in df_male.groupby(ACTUAL_ID_COLUMN): # 按孕周排序,取最新记录的BMI preg_group_sorted = preg_group.sort_values("gestational_week").reset_index(drop=True) last_record = preg_group_sorted.iloc[-1] # 最新一次检测记录 # 确定达标时间(首次达标孕周,未达标则取最后一次检测孕周) qualified_rows = preg_group_sorted[preg_group_sorted["is_qualified"] == 1] if not qualified_rows.empty: qualify_time = qualified_rows["gestational_week"].min() event = 1 # 达标(事件发生) else: qualify_time = preg_group_sorted["gestational_week"].max() event = 0 # 未达标(截尾数据) # 筛选BMI在合理范围的样本(18-45) bmi = last_record[ACTUAL_BMI_COLUMN] if 18.0 <= bmi <= 45.0 and not pd.isna(qualify_time): surv_data.append({ ACTUAL_ID_COLUMN: pid, "BMI": bmi, "time": qualify_time, # 达标时间/截尾时间 "event": event # 是否达标(1=是,0=否) }) # 生成最终生存分析数据集 surv_df = pd.DataFrame(surv_data).drop_duplicates(ACTUAL_ID_COLUMN).reset_index(drop=True) print(f"\n=== 数据处理完成 ===") print(f"有效孕妇样本数:{len(surv_df)}") print(f"达标孕妇数:{surv_df['event'].sum()}(占比:{surv_df['event'].sum() / len(surv_df) * 100:.1f}%)") print(f"BMI范围:{surv_df['BMI'].min():.1f} - {surv_df['BMI'].max():.1f}") print(f"孕周范围:{surv_df['time'].min():.1f} - {surv_df['time'].max():.1f}周") if len(surv_df) < 20: raise ValueError(f"最终有效样本数过少({len(surv_df)} < 20),无法进行二分法分组") return surv_df if __name__ == "__main__": EXCEL_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\附件_YNgHYs.xlsx" PARTITION_PLOT_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\BMI二分法分割过程.png" SURVIVAL_PLOT_PATH = r"C:\Users\30582\PycharmProjects\PythonProject\来来来\BMI分组生存曲线.png" try: # 2. 数据预处理:得到生存分析所需的(BMI, time, event)数据 surv_df = process_nipt_data(file_path=EXCEL_PATH) time = surv_df["time"].values # 达标时间/截尾时间(孕周) event = surv_df["event"].values # 是否达标(1=是,0=否) bmi_values = surv_df["BMI"].values # 孕妇BMI值 global_idx = np.arange(len(surv_df)) # 样本全局索引(用于跟踪分组) # 3. 初始化分析器并执行递归二分法 analyzer = SurvivalAnalyzer() print(f"\n=== 开始递归二分法分组 ===") analyzer.recursive_partitioning( time=time, event=event, bmi_values=bmi_values, global_idx=global_idx, min_samples=20, # 子组最小样本量(小于2*20则终止) p_threshold=0.05, # Log-Rank检验显著性阈值 max_depth=3 # 最大递归深度(避免过拟合) ) # 4. 输出分组结果摘要 print(f"\n=== 分组结果摘要 ===") print(f"有效分割点数量:{len(analyzer.best_cutpoints)}") for i, (depth, cut, p) in enumerate(analyzer.best_cutpoints, 1): print(f" 分割点{i}:深度{depth},BMI={cut:.2f},Log-Rank p={p:.4f}") print(f"最终分组数:{len(analyzer.groups)}") for group in analyzer.groups: print( f" 组{group['group_id']}:BMI [{group['bmi_range'][0]:.1f}, {group['bmi_range'][1]:.1f}],样本数={len(group['global_idx'])}") # 5. 确定并输出最佳NIPT检测时点 analyzer.determine_optimal_time(time, event) # 6. 可视化:先画分割过程,再画生存曲线(含最佳时点标记) analyzer.plot_partition_process(save_path=PARTITION_PLOT_PATH) analyzer.plot_survival_curves(time=time, event=event, save_path=SURVIVAL_PLOT_PATH) except Exception as e: print(f"\n执行错误:{str(e)}") 将代码中使用绝对路径引用文件的地方全部改为使用文件名
09-07
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值