import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")
# Key iCOVID parameters from the source document (used to align feature processing)
CLINICAL_NUM_FEATURES = ["age", "albumin", "hemoglobin", "total_protein", "ldh"]  # numeric clinical features (ref. 🔶1-41, 🔶1-94)
CLINICAL_CAT_FEATURES = ["gender", "fever", "cough", "diabetes", "ards", "shock"]  # categorical clinical features (ref. 🔶1-41, 🔶1-93)
OUTCOME_DEF = {0: "censored", 1: "recovered", 2: "deceased"}  # outcome types (the σₙ labels, ref. 🔶1-51)
MAX_RECOVER_DAY = 31  # recovery times > 30 days are capped at 31; death is coded as day 32 (ref. 🔶1-60)
# Updated file paths
file_paths = {
"metainfo": "/mnt/patients_metainfo (TARGET GDC,2025).csv",
"mutation": "/mnt/patients_mutation (TARGET GDC,2025).csv",
"outcomes": "/mnt/patients_outcomes (TARGET GDC,2025).csv",
"mrna": "/mnt/mrna_zcores.xlsx",
"cna": "/mnt/cna.xlsx"
}
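# Optional sanity check (an addition, not part of the original pipeline): fail fast
# with a clear message if any input file is missing, instead of a mid-pipeline read error.
import os
for _name, _path in file_paths.items():
    if not os.path.exists(_path):
        raise FileNotFoundError(f"Input file for '{_name}' not found: {_path}")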
# Data loading and consistency checks (a core requirement of the source document)
# Load the available files
df_metainfo = pd.read_csv(file_paths["metainfo"])
df_mutation = pd.read_csv(file_paths["mutation"])
df_outcomes = pd.read_csv(file_paths["outcomes"])
df_mrna = pd.read_excel(file_paths["mrna"])  # read the mRNA z-score Excel file
df_cna = pd.read_excel(file_paths["cna"])  # read the CNA Excel file
# Inspect basic info for each file
print("Patient counts and column names per file:")
for name, df in [("metainfo", df_metainfo), ("mutation", df_mutation), ("outcomes", df_outcomes), ("mrna", df_mrna), ("cna", df_cna)]:
    print(f"- {name}: patients={len(df)}, columns={len(df.columns)}, first columns={list(df.columns)[:5]}...")
def unify_patient_id(df_list, base_df, df_names=None, id_col="patient_id"):
    """
    Unify patient IDs: take the IDs in base_df (the outcomes file) as the reference
    and drop non-matching IDs from every other file.
    df_list: DataFrames to filter; base_df: reference DataFrame (outcomes file);
    df_names: optional file labels used in the log messages.
    """
    if df_names is None:
        df_names = [f"file{i}" for i in range(len(df_list))]
    # Reference ID set
    base_ids = set(base_df[id_col].unique())
    print(f"Patients in the reference outcomes file: {len(base_ids)}")
    # Validate and filter each file
    unified_dfs = []
    for df, name in zip(df_list, df_names):
        # Make sure the ID column exists
        if id_col not in df.columns:
            raise ValueError(f"File is missing the {id_col} column, please check its format!")
        # Keep only matching IDs
        df_filtered = df[df[id_col].isin(base_ids)].copy()
        # Report how many rows were dropped
        deleted = len(df) - len(df_filtered)
        print(f"Filtered {name}: kept {len(df_filtered)} patients, dropped {deleted} (unmatched IDs)")
        unified_dfs.append(df_filtered)
    return unified_dfs + [base_df]  # all filtered DataFrames plus the reference outcomes file
# Unify IDs (files to filter: metainfo, mutation, mrna, cna)
df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes = unify_patient_id(
    df_list=[df_metainfo, df_mutation, df_mrna, df_cna],
    base_df=df_outcomes,
    df_names=["metainfo", "mutation", "mrna", "cna"]
)
# Re-check that all files now share the same ID set
all_ids = [set(df["patient_id"].unique()) for df in [df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes]]
assert len(set.intersection(*all_ids)) == len(all_ids[0]), "IDs still do not match after filtering, please check the data!"
print(f"\nFinal unified patient count: {len(set.intersection(*all_ids))}")
def stat_missing_values(df_list, df_names):
    """Report the missing-value ratio for each file."""
    missing_stats = []
    for df, name in zip(df_list, df_names):
        # Per-column missing rate (%)
        missing_rate = (df.isnull().sum() / len(df) * 100).round(2)
        # Keep only columns that actually have missing values
        missing_cols = missing_rate[missing_rate > 0]
        if len(missing_cols) > 0:
            missing_stats.append({
                "file": name,
                "missing_cols": dict(missing_cols),
                "max_missing_rate": missing_cols.max(),
                "min_missing_rate": missing_cols.min()
            })
        else:
            missing_stats.append({
                "file": name,
                "missing_cols": "none",
                "max_missing_rate": 0,
                "min_missing_rate": 0
            })
    # Present the summary as a DataFrame
    df_missing = pd.DataFrame(missing_stats)
    return df_missing
# Compute the missing-value summary
df_missing = stat_missing_values(
    df_list=[df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes],
    df_names=["metainfo", "mutation", "mrna", "cna", "outcomes"]
)
print("Missing-value summary per file:")
print(df_missing.to_string(index=False))
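# Optional extension (an assumption, not required by the source document): columns
# with very high missing rates carry little signal, and dropping them before
# imputation is common practice. The 50% threshold below is illustrative only.
def drop_high_missing_cols(df, threshold=50.0, id_col="patient_id"):
    missing_rate = df.isnull().sum() / len(df) * 100
    drop_cols = [c for c in df.columns if c != id_col and missing_rate[c] > threshold]
    return df.drop(columns=drop_cols)
# Example usage: df_mutation = drop_high_missing_cols(df_mutation, threshold=50.0)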
# Per-file feature preprocessing (strictly aligned with the source document)
def preprocess_metainfo(df_metainfo, id_col="patient_id"):
    """
    Process basic clinical info: binarize categorical features, normalize numeric ones.
    Categorical (e.g. gender, fever): 0 = absent/female, 1 = present/male;
    numeric (age): Min-Max normalized to [0, 1].
    """
    df = df_metainfo.copy()
    # 1. Binarize categorical features (following the symptom/comorbidity encoding in 🔶1-41)
    cat_features = ["gender", "fever", "cough", "expectoration", "diabetes", "ards", "shock"]
    for feat in cat_features:
        if feat in df.columns:
            # Fill missing with 0 (absent), then map non-missing values to 0/1
            df[feat] = df[feat].fillna(0)
            # If the raw values are strings ("女"/"男" = female/male, "无"/"有" = absent/present), map them to 0/1
            if df[feat].dtype == "object":
                df[feat] = df[feat].map({"女": 0, "男": 1, "无": 0, "有": 1}).fillna(0)
            df[feat] = df[feat].astype(int)
    # 2. Normalize numeric features (age; Min-Max normalization per 🔶1-136)
    num_features = ["age"]
    scaler = MinMaxScaler(feature_range=(0, 1))
    for feat in num_features:
        if feat in df.columns:
            # Fill missing with the column mean
            df[feat] = df[feat].fillna(df[feat].mean())
            # Normalize to [0, 1]; the returned scaler reflects the last feature fitted
            df[feat] = scaler.fit_transform(df[[feat]]).flatten()
    # 3. Keep the ID plus the processed features
    keep_cols = [id_col] + cat_features + num_features
    keep_cols = [col for col in keep_cols if col in df.columns]
    df_processed = df[keep_cols]
    return df_processed, scaler
# Run the preprocessing
df_metainfo_processed, age_scaler = preprocess_metainfo(df_metainfo)
print(f"\nmetainfo processed: columns={len(df_metainfo_processed.columns)}, sample:")
print(df_metainfo_processed.head())
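# Persisting the fitted age scaler (a sketch; joblib ships as a scikit-learn
# dependency, and the file path below is illustrative) so the same Min-Max
# parameters can be reapplied to new patients at inference time.
import joblib
joblib.dump(age_scaler, "./age_scaler.joblib")
# Later: age_scaler = joblib.load("./age_scaler.joblib")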
def preprocess_mutation(df_mutation, id_col="patient_id"):
    """
    Process gene-mutation features: binarize (0 = wild type, 1 = mutated), fill missing with 0.
    """
    df = df_mutation.copy()
    # All gene columns (everything except the ID column)
    gene_cols = [col for col in df.columns if col != id_col]
    # 1. Fill missing with 0 (assume no mutation)
    df[gene_cols] = df[gene_cols].fillna(0)
    # 2. Binarize (map raw strings such as "突变"/"野生型" = mutated/wild type to 1/0)
    for col in gene_cols:
        if df[col].dtype == "object":
            df[col] = df[col].map({"野生型": 0, "突变": 1}).fillna(0).astype(int)
        # Numeric values (e.g. mutation frequency): > 0 counts as mutated
        else:
            df[col] = (df[col] > 0).astype(int)
    df_processed = df[[id_col] + gene_cols]
    return df_processed
# Run the preprocessing
df_mutation_processed = preprocess_mutation(df_mutation)
print(f"\nmutation processed: gene features={len(df_mutation_processed.columns)-1}, sample:")
print(df_mutation_processed.head())
def preprocess_mrna(df_mrna, id_col="patient_id"):
    """
    Process mRNA expression features: the z-scores are already standardized, so only
    missing values need filling (with 0, i.e. equal to the mean).
    """
    df = df_mrna.copy()
    # 1. Make sure the ID column exists under a unified name
    if "sample_id" in df.columns and id_col not in df.columns:
        df.rename(columns={"sample_id": id_col}, inplace=True)
    # 2. Gene columns (everything except the ID column)
    gene_cols = [col for col in df.columns if col != id_col]
    # 3. Fill missing with 0 (a z-score of 0 equals the mean, matching the missing-value logic in 🔶1-136)
    df[gene_cols] = df[gene_cols].fillna(0)
    df_processed = df[[id_col] + gene_cols]
    return df_processed
# Run the preprocessing
df_mrna_processed = preprocess_mrna(df_mrna)
print(f"\nmrna processed: gene features={len(df_mrna_processed.columns)-1}, sample:")
print(df_mrna_processed.head())
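# Optional safeguard (an assumption, not from the source document): extreme z-scores
# can dominate downstream models, and clipping to a symmetric range is common practice.
# Kept commented out, like the Pearson block at the end; the ±5 bound is illustrative.
# z_cols = [c for c in df_mrna_processed.columns if c != "patient_id"]
# df_mrna_processed[z_cols] = df_mrna_processed[z_cols].clip(-5, 5)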
def preprocess_cna(df_cna, id_col="patient_id"):
    """
    Process copy-number alteration features: encode as -1 (deletion), 0 (neutral),
    1 (amplification); fill missing with 0.
    """
    df = df_cna.copy()
    gene_cols = [col for col in df.columns if col != id_col]
    # 1. Fill missing with 0 (assume copy-number neutral)
    df[gene_cols] = df[gene_cols].fillna(0)
    # 2. Encode (map raw strings such as "缺失"/"正常"/"扩增" = deletion/neutral/amplification to -1/0/1)
    for col in gene_cols:
        if df[col].dtype == "object":
            df[col] = df[col].map({"缺失": -1, "正常": 0, "扩增": 1}).fillna(0)
        # Numeric values (e.g. copy number): < 0 -> -1, == 0 -> 0, > 0 -> 1
        else:
            df[col] = np.where(df[col] < 0, -1, np.where(df[col] > 0, 1, 0))
    df_processed = df[[id_col] + gene_cols]
    return df_processed
# Run the preprocessing
df_cna_processed = preprocess_cna(df_cna)
print(f"\ncna processed: gene features={len(df_cna_processed.columns)-1}, sample:")
print(df_cna_processed.head())
def preprocess_outcomes(df_outcomes, id_col="patient_id", max_recover_day=MAX_RECOVER_DAY):
    """
    Process outcome variables: define σₙ (0 = censored, 1 = recovered, 2 = deceased)
    and tₙ (time to outcome). Recovery times > 30 days are set to 31, death to 32 (ref. 🔶1-60).
    """
    df = df_outcomes.copy()
    # 1. Encode the outcome type (σₙ)
    if "outcome_type" in df.columns:
        # If the raw values are strings, map them to 0/1/2
        if df["outcome_type"].dtype == "object":
            df["outcome_type"] = df["outcome_type"].map({
                "censored": 0, "lost_to_follow_up": 0,  # censored (lost to follow-up)
                "recovered": 1, "discharged": 1,  # recovered (discharged)
                "deceased": 2, "death": 2  # deceased
            }).fillna(0)  # treat missing as censored
    else:
        # Without an outcome_type column, infer it from the outcome time (ref. 🔶1-51)
        df["outcome_type"] = np.where(
            df["outcome_days"].isnull(), 0,  # no time -> censored
            np.where(df["outcome_days"] <= 30, 1, 2)  # <= 30 days -> recovered, > 30 -> deceased (provisional)
        )
    # 2. Process the outcome time (tₙ)
    if "outcome_days" in df.columns:
        # Fill missing with 7 (default censoring time, ref. 🔶1-30)
        df["outcome_days"] = df["outcome_days"].fillna(7).astype(int)
        # Recovered patients: cap times > 30 days at 31
        df.loc[(df["outcome_type"] == 1) & (df["outcome_days"] > 30), "outcome_days"] = max_recover_day
        # Deceased patients: fixed at 32
        df.loc[df["outcome_type"] == 2, "outcome_days"] = 32
        # Censored patients: clip times into the 3-10 day window (ref. 🔶1-30)
        df.loc[df["outcome_type"] == 0, "outcome_days"] = np.clip(df.loc[df["outcome_type"] == 0, "outcome_days"], 3, 10)
    # Keep the key columns only
    df_processed = df[[id_col, "outcome_type", "outcome_days"]]
    return df_processed
# Run the preprocessing
df_outcomes_processed = preprocess_outcomes(df_outcomes)
print(f"\noutcomes processed: outcome distribution={df_outcomes_processed['outcome_type'].value_counts().to_dict()}, sample:")
print(df_outcomes_processed.head())
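# Sanity checks on the outcome encoding (a minimal sketch of the rules above):
# recovered patients must have tₙ <= 31, deceased patients exactly 32, and
# censored patients within the 3-10 day window set during preprocessing.
_oc = df_outcomes_processed
assert _oc.loc[_oc["outcome_type"] == 1, "outcome_days"].le(MAX_RECOVER_DAY).all()
assert (_oc.loc[_oc["outcome_type"] == 2, "outcome_days"] == 32).all()
assert _oc.loc[_oc["outcome_type"] == 0, "outcome_days"].between(3, 10).all()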
# Integrate all tables into a patient-level multimodal matrix
def integrate_multimodal_data(df_list, id_col="patient_id"):
    """
    Multi-table integration: left-join all processed DataFrames on patient_id.
    df_list: processed DataFrames (metainfo, mutation, mrna, cna, outcomes).
    """
    # Start from the first DataFrame and join the rest one by one
    integrated_df = df_list[0].copy()
    for df in df_list[1:]:
        integrated_df = integrated_df.merge(df, on=id_col, how="left")
        # Check for missing values introduced by the join (there should be none, since IDs were unified)
        new_missing = integrated_df.isnull().sum().sum()
        if new_missing > 0:
            print(f"Join introduced {new_missing} missing values; filling with 0")
            integrated_df = integrated_df.fillna(0)
    # Make sure there are no duplicate patients
    integrated_df = integrated_df.drop_duplicates(subset=[id_col])
    print(f"\nIntegrated multimodal matrix: patients={len(integrated_df)}, features={len(integrated_df.columns)-3} (excluding ID, outcome type, outcome time)")
    return integrated_df
# Run the integration (order: metainfo -> mutation -> mrna -> cna -> outcomes)
df_integrated = integrate_multimodal_data([
df_metainfo_processed,
df_mutation_processed,
df_mrna_processed,
df_cna_processed,
df_outcomes_processed
])
# Inspect the integrated result
print("\nIntegrated data sample (first 5 + last 3 columns):")
print(df_integrated.iloc[:, list(range(5)) + list(range(-3, 0))].head())
# Data split and export (following the document's validation strategy)
def split_and_save_data(df_integrated, id_col="patient_id", save_path="./processed_data/"):
    """
    Split the data with five-fold cross-validation into train/validation sets and save
    everything as CSV (five-fold CV per 🔶1-26). Returns the last fold's train/val sets.
    """
    import os
    os.makedirs(save_path, exist_ok=True)
    # 1. Save the full integrated dataset
    df_integrated.to_csv(os.path.join(save_path, "integrated_patients_data.csv"), index=False)
    print(f"Full dataset saved to: {os.path.join(save_path, 'integrated_patients_data.csv')}")
    # 2. Five-fold cross-validation split
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    patients = df_integrated[id_col].unique()
    fold = 1
    for train_idx, val_idx in kf.split(patients):
        # Split patient IDs into train/validation
        train_ids = patients[train_idx]
        val_ids = patients[val_idx]
        # Build the train/validation sets
        df_train = df_integrated[df_integrated[id_col].isin(train_ids)]
        df_val = df_integrated[df_integrated[id_col].isin(val_ids)]
        # Save both
        df_train.to_csv(os.path.join(save_path, f"train_fold{fold}.csv"), index=False)
        df_val.to_csv(os.path.join(save_path, f"val_fold{fold}.csv"), index=False)
        print(f"Fold {fold}: train={len(df_train)} patients, val={len(df_val)} patients, saved")
        fold += 1
    # 3. Save feature metadata (for later model interpretation, cf. the FSR mechanism 🔶1-61)
    feature_cols = [col for col in df_integrated.columns if col not in [id_col, "outcome_type", "outcome_days"]]
    def _feature_type(col):
        if col in df_metainfo_processed.columns: return "clinical"
        if col in df_mutation_processed.columns: return "mutation"
        if col in df_mrna_processed.columns: return "mrna"
        if col in df_cna_processed.columns: return "cna"
        return ""
    def _processing_method(col):
        if col in df_mutation_processed.columns or col in CLINICAL_CAT_FEATURES: return "binary"
        if col in df_metainfo_processed.columns: return "normalized"
        if col in df_mrna_processed.columns: return "z_score"
        if col in df_cna_processed.columns: return "ternary"
        return ""
    feature_info = pd.DataFrame({
        "feature_name": feature_cols,
        "feature_type": [_feature_type(col) for col in feature_cols],
        "processing_method": [_processing_method(col) for col in feature_cols]
    })
    feature_info.to_csv(os.path.join(save_path, "feature_metadata.csv"), index=False)
    print(f"Feature metadata saved to: {os.path.join(save_path, 'feature_metadata.csv')}")
    return df_train, df_val
# Run the split and save
df_train, df_val = split_and_save_data(df_integrated)
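# Alternative split (an assumption, not prescribed by the source document): outcome
# classes are typically imbalanced, so StratifiedKFold keeps each fold's outcome mix
# close to the overall distribution (requires >= 5 patients per outcome class).
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold_i, (tr_idx, va_idx) in enumerate(skf.split(df_integrated, df_integrated["outcome_type"]), start=1):
    print(f"Stratified fold {fold_i}: train={len(tr_idx)}, val={len(va_idx)}")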
# Visualize key results (to verify the preprocessing)
# 1. Outcome distribution (ref. 🔶1-30)
plt.figure(figsize=(12, 4))
# Outcome-type distribution
plt.subplot(1, 2, 1)
outcome_counts = df_integrated["outcome_type"].value_counts().sort_index()
outcome_labels = [OUTCOME_DEF[i] for i in outcome_counts.index]
plt.bar(outcome_labels, outcome_counts.values, color=["lightblue", "lightgreen", "salmon"])
plt.title("Patient outcome distribution (ref. 🔶1-30)")
plt.ylabel("Number of patients")
# Recovery-time distribution (recovered patients only)
plt.subplot(1, 2, 2)
recover_days = df_integrated[df_integrated["outcome_type"] == 1]["outcome_days"]
plt.hist(recover_days, bins=10, color="lightgreen", edgecolor="black")
plt.title("Recovery-time distribution (≤ 31 days, ref. 🔶1-60)")
plt.xlabel("Recovery time (days)")
plt.ylabel("Number of patients")
plt.tight_layout()
plt.savefig("./processed_data/outcome_distribution.png", dpi=300, bbox_inches="tight")
plt.close()
print("\nOutcome distribution figure saved to: ./processed_data/outcome_distribution.png")
# 2. Key biomarker correlations (Pearson analysis per 🔶1-94)
# No biomarker data is available in the current files, so this block stays commented out.
# from scipy.stats import pearsonr
# markers = ["albumin", "hemoglobin", "total_protein", "ldh"]
# markers = [m for m in markers if m in df_integrated.columns]
# recover_days = df_integrated[df_integrated["outcome_type"] == 1]["outcome_days"]
# corr_results = []
# for marker in markers:
#     marker_vals = df_integrated[df_integrated["outcome_type"] == 1][marker]
#     corr, p_val = pearsonr(marker_vals, recover_days)
#     corr_results.append({"marker": marker, "pearson_corr": round(corr, 3), "p_value": round(p_val, 4)})
# df_corr = pd.DataFrame(corr_results)
# print("\nPearson correlation between key biomarkers and recovery time (ref. 🔶1-94):")
# print(df_corr.to_string(index=False))