/// BMFontUnicode
BMFontUnicode::BMFontUnicode()
{
    // Load the localized strings from a plist-style XML dictionary.
    CCDictionary *strings = CCDictionary::dictionaryWithContentsOfFile("fonts/strings.xml");
    const char *chinese  = ((CCString*)strings->objectForKey("chinese1"))->m_sString.c_str();
    const char *japanese = ((CCString*)strings->objectForKey("japanese"))->m_sString.c_str();
    const char *spanish  = ((CCString*)strings->objectForKey("spanish"))->m_sString.c_str();

    // Build a BMFont label from one of the UTF-8 strings (the .fnt file name
    // here is an assumption; any bitmap-font atlas covering these glyphs works).
    CCLabelBMFont *label = CCLabelBMFont::labelWithString(chinese, "fonts/arial-unicode-26.fnt");
    addChild(label);
}

Key point about the code above: the engine only supports UTF-8 encoding, so fonts/strings.xml must be saved as UTF-8; the const char* pointers then hold UTF-8 byte sequences that CCLabelBMFont can render.
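If a label renders as garbage glyphs, the first thing to check is the file's encoding. A minimal sketch of such a check (a hypothetical helper script, not part of cocos2d-x):

# check_encoding.py -- hypothetical helper, not part of the engine.
# Reads the raw bytes of strings.xml and verifies they decode as UTF-8.
with open("fonts/strings.xml", "rb") as f:
    raw = f.read()

try:
    raw.decode("utf-8")
    print("fonts/strings.xml is valid UTF-8")
except UnicodeDecodeError as err:
    # A file saved as GBK, Shift-JIS, or Latin-1 fails here, and the
    # engine would show broken glyphs for the same bytes.
    print(f"Not UTF-8: {err}")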

Posting the code and the first file:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

# Key iCOVID parameters from the document (used to align feature processing)
CLINICAL_NUM_FEATURES = ["age", "albumin", "hemoglobin", "total_protein", "ldh"]  # numeric clinical features (see 🔶1-41, 🔶1-94)
CLINICAL_CAT_FEATURES = ["gender", "fever", "cough", "diabetes", "ards", "shock"]  # categorical clinical features (see 🔶1-41, 🔶1-93)
OUTCOME_DEF = {0: "censored", 1: "recovered", 2: "deceased"}  # outcome types (the σₙ of 🔶1-51)
MAX_RECOVER_DAY = 31  # recovery time > 30 days is set to 31; death is set to 32 (see 🔶1-60)

# Updated file paths
file_paths = {
    "metainfo": "/mnt/patients_metainfo (TARGET GDC,2025).csv",
    "mutation": "/mnt/patients_mutation (TARGET GDC,2025).csv",
    "outcomes": "/mnt/patients_outcomes (TARGET GDC,2025).csv",
    "mrna": "/mnt/mrna_zcores.xlsx",
    "cna": "/mnt/cna.xlsx",
}

# ---- Data loading and consistency checks (a core requirement of the document) ----
df_metainfo = pd.read_csv(file_paths["metainfo"])
df_mutation = pd.read_csv(file_paths["mutation"])
df_outcomes = pd.read_csv(file_paths["outcomes"])
df_mrna = pd.ExcelFile(file_paths["mrna"]).parse()  # read Excel file
df_cna = pd.ExcelFile(file_paths["cna"]).parse()    # read Excel file

# Basic info for each file
print("Patient counts and column names per file:")
for name, df in [("metainfo", df_metainfo), ("mutation", df_mutation),
                 ("outcomes", df_outcomes), ("mrna", df_mrna), ("cna", df_cna)]:
    print(f"- {name}: patients={len(df)}, columns={len(df.columns)}, names={list(df.columns)[:5]}...")


def unify_patient_id(df_list, base_df, id_col="patient_id"):
    """
    Unify patient IDs: take the IDs in base_df (the outcomes file) as the
    reference and drop non-matching IDs from every other file.
    df_list: DataFrames to filter; base_df: reference DataFrame (outcomes file).
    """
    base_ids = set(base_df[id_col].unique())
    print(f"Patients in reference outcomes file: {len(base_ids)}")
    unified_dfs = []
    for df in df_list:
        if id_col not in df.columns:
            raise ValueError(f"A file is missing the {id_col} column; please check its format!")
        df_filtered = df[df[id_col].isin(base_ids)].copy()
        deleted = len(df) - len(df_filtered)
        print(f"After filtering {df.columns[0].split('_')[0]}: kept {len(df_filtered)}, dropped {deleted} (unmatched IDs)")
        unified_dfs.append(df_filtered)
    return unified_dfs + [base_df]  # all filtered frames plus the reference outcomes file


# Unify IDs (files to filter: metainfo, mutation, mrna, cna)
df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes = unify_patient_id(
    df_list=[df_metainfo, df_mutation, df_mrna, df_cna],
    base_df=df_outcomes,
)

# Verify that all files now share the same ID set
all_ids = [set(df["patient_id"].unique())
           for df in [df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes]]
assert len(set.intersection(*all_ids)) == len(all_ids[0]), "IDs still mismatched after filtering; please check the data!"
print(f"\nFinal unified patient count: {len(set.intersection(*all_ids))}")


def stat_missing_values(df_list, df_names):
    """Report the missing-value ratio of every file."""
    missing_stats = []
    for df, name in zip(df_list, df_names):
        # Per-column missing rate in percent
        missing_rate = (df.isnull().sum() / len(df) * 100).round(2)
        missing_cols = missing_rate[missing_rate > 0]
        if len(missing_cols) > 0:
            missing_stats.append({
                "file": name,
                "missing_cols": dict(missing_cols),
                "max_missing_rate": missing_cols.max(),
                "min_missing_rate": missing_cols.min(),
            })
        else:
            missing_stats.append({
                "file": name,
                "missing_cols": "none",
                "max_missing_rate": 0,
                "min_missing_rate": 0,
            })
    return pd.DataFrame(missing_stats)


df_missing = stat_missing_values(
    df_list=[df_metainfo, df_mutation, df_mrna, df_cna, df_outcomes],
    df_names=["metainfo", "mutation", "mrna", "cna", "outcomes"],
)
print("Missing-value statistics per file:")
print(df_missing.to_string(index=False))


# ---- Per-file feature preprocessing (strictly following the document's logic) ----
def preprocess_metainfo(df_metainfo, id_col="patient_id"):
    """
    Clinical metadata: binarize categorical features, normalize numeric features.
    Categorical (gender, fever, ...): 0 = no/female, 1 = yes/male; numeric (age): scaled to [0, 1].
    """
    df = df_metainfo.copy()
    # 1. Binarize categorical features (symptom/comorbidity encoding of 🔶1-41)
    cat_features = ["gender", "fever", "cough", "expectoration", "diabetes", "ards", "shock"]
    for feat in cat_features:
        if feat in df.columns:
            df[feat] = df[feat].fillna(0)  # missing -> 0 (absent)
            if df[feat].dtype == "object":  # strings such as "男"/"女" -> 0/1
                df[feat] = df[feat].map({"女": 0, "男": 1, "无": 0, "有": 1}).fillna(0)
            df[feat] = df[feat].astype(int)
    # 2. Normalize numeric features (age; min-max scaling per 🔶1-136)
    num_features = ["age"]
    scaler = MinMaxScaler(feature_range=(0, 1))
    for feat in num_features:
        if feat in df.columns:
            df[feat] = df[feat].fillna(df[feat].mean())  # impute with the mean
            df[feat] = scaler.fit_transform(df[[feat]]).flatten()
    # 3. Keep the ID plus the processed features
    keep_cols = [id_col] + cat_features + num_features
    keep_cols = [col for col in keep_cols if col in df.columns]
    return df[keep_cols], scaler


df_metainfo_processed, age_scaler = preprocess_metainfo(df_metainfo)
print(f"\nmetainfo processed: columns={len(df_metainfo_processed.columns)}, sample:")
print(df_metainfo_processed.head())


def preprocess_mutation(df_mutation, id_col="patient_id"):
    """Gene mutations: binarize (0 = no mutation, 1 = mutated); missing -> 0."""
    df = df_mutation.copy()
    gene_cols = [col for col in df.columns if col != id_col]  # all gene columns
    df[gene_cols] = df[gene_cols].fillna(0)  # missing -> no mutation
    for col in gene_cols:
        if df[col].dtype == "object":  # strings such as "突变"/"野生型"
            df[col] = df[col].map({"野生型": 0, "突变": 1}).fillna(0)
        else:  # numeric (e.g. mutation frequency): > 0 counts as mutated
            df[col] = (df[col] > 0).astype(int)
    return df[[id_col] + gene_cols]


df_mutation_processed = preprocess_mutation(df_mutation)
print(f"\nmutation processed: gene features={len(df_mutation_processed.columns) - 1}, sample:")
print(df_mutation_processed.head())


def preprocess_mrna(df_mrna, id_col="patient_id"):
    """
    mRNA expression: the z-scores are already standardized, so only fill
    missing values (with 0, i.e. equal to the mean) and keep a CSV-friendly layout.
    """
    df = df_mrna.copy()
    # 1. Make sure the ID column exists under the unified name
    if "sample_id" in df.columns and id_col not in df.columns:
        df.rename(columns={"sample_id": id_col}, inplace=True)
    # 2. Gene columns (everything but the ID)
    gene_cols = [col for col in df.columns if col != id_col]
    # 3. Missing -> 0 (a z-score of 0 equals the mean; matches 🔶1-136)
    df[gene_cols] = df[gene_cols].fillna(0)
    return df[[id_col] + gene_cols]


df_mrna_processed = preprocess_mrna(df_mrna)
print(f"\nmrna processed: gene features={len(df_mrna_processed.columns) - 1}, sample:")
print(df_mrna_processed.head())


def preprocess_cna(df_cna, id_col="patient_id"):
    """Copy-number alterations: encode as -1 (deletion), 0 (normal), 1 (amplification); missing -> 0."""
    df = df_cna.copy()
    gene_cols = [col for col in df.columns if col != id_col]
    df[gene_cols] = df[gene_cols].fillna(0)  # missing -> normal
    for col in gene_cols:
        if df[col].dtype == "object":  # strings such as "缺失"/"正常"/"扩增" -> -1/0/1
            df[col] = df[col].map({"缺失": -1, "正常": 0, "扩增": 1}).fillna(0)
        else:  # numeric copy number: < 0 -> -1, == 0 -> 0, > 0 -> 1
            df[col] = np.where(df[col] < 0, -1, np.where(df[col] > 0, 1, 0))
    return df[[id_col] + gene_cols]


df_cna_processed = preprocess_cna(df_cna)
print(f"\ncna processed: gene features={len(df_cna_processed.columns) - 1}, sample:")
print(df_cna_processed.head())


def preprocess_outcomes(df_outcomes, id_col="patient_id", max_recover_day=MAX_RECOVER_DAY):
    """
    Outcome variables: define σₙ (0 = censored, 1 = recovered, 2 = deceased) and tₙ (outcome time).
    Recovery times > 30 days are set to 31; deaths are set to 32 (see 🔶1-60).
    """
    df = df_outcomes.copy()
    # 1. Encode the outcome type (σₙ)
    if "outcome_type" in df.columns:
        if df["outcome_type"].dtype == "object":  # map string labels to 0/1/2
            df["outcome_type"] = df["outcome_type"].map({
                "censored": 0, "lost_to_follow_up": 0,  # censored (lost to follow-up)
                "recovered": 1, "discharged": 1,        # recovered (discharged)
                "deceased": 2, "death": 2,              # deceased
            }).fillna(0)  # missing counts as censored
    else:
        # No outcome_type column: infer it from the outcome time (see 🔶1-51)
        df["outcome_type"] = np.where(
            df["outcome_days"].isnull(), 0,            # no time -> censored
            np.where(df["outcome_days"] <= 30, 1, 2),  # <= 30 days -> recovered, > 30 -> deceased (provisional)
        )
    # 2. Process the outcome time (tₙ)
    if "outcome_days" in df.columns:
        df["outcome_days"] = df["outcome_days"].fillna(7).astype(int)  # default censoring time (see 🔶1-30)
        # Recovered patients: > 30 days -> 31
        df.loc[(df["outcome_type"] == 1) & (df["outcome_days"] > 30), "outcome_days"] = max_recover_day
        # Deceased patients: fixed at 32
        df.loc[df["outcome_type"] == 2, "outcome_days"] = 32
        # Censored patients: clamp to <= 10 days (see 🔶1-30)
        df.loc[df["outcome_type"] == 0, "outcome_days"] = np.clip(
            df.loc[df["outcome_type"] == 0, "outcome_days"], 3, 10)
    return df[[id_col, "outcome_type", "outcome_days"]]


df_outcomes_processed = preprocess_outcomes(df_outcomes)
print(f"\noutcomes processed: distribution={df_outcomes_processed['outcome_type'].value_counts().to_dict()}, sample:")
print(df_outcomes_processed.head())


# ---- Merge all tables into one patient-level multimodal matrix ----
def integrate_multimodal_data(df_list, id_col="patient_id"):
    """
    Merge the processed DataFrames (metainfo, mutation, mrna, cna, outcomes)
    with left joins on patient_id.
    """
    integrated_df = df_list[0].copy()
    for df in df_list[1:]:
        integrated_df = integrated_df.merge(df, on=id_col, how="left")
        # Joining should not create new NaNs (IDs were unified), but check anyway
        new_missing = integrated_df.isnull().sum().sum() - integrated_df[id_col].isnull().sum()
        if new_missing > 0:
            print(f"Join with {df.columns[1].split('_')[0]} added {new_missing} missing values; filled with 0")
            integrated_df = integrated_df.fillna(0)
    integrated_df = integrated_df.drop_duplicates(subset=[id_col])  # no duplicate rows
    print(f"\nIntegrated multimodal matrix: patients={len(integrated_df)}, "
          f"features={len(integrated_df.columns) - 3} (excluding ID, outcome type, outcome time)")
    return integrated_df


# Merge in the order metainfo -> mutation -> mrna -> cna -> outcomes
df_integrated = integrate_multimodal_data([
    df_metainfo_processed, df_mutation_processed,
    df_mrna_processed, df_cna_processed, df_outcomes_processed,
])

print("\nIntegrated sample (first 5 + last 3 columns):")
print(df_integrated.iloc[:, list(range(5)) + list(range(-3, 0))].head())


# ---- Data split and output (matching the document's validation strategy) ----
def split_and_save_data(df_integrated, id_col="patient_id", save_path="./processed_data/"):
    """
    Five-fold cross-validation split into training/validation sets, saved as CSV
    (the five-fold protocol of 🔶1-26).
    """
    import os
    os.makedirs(save_path, exist_ok=True)

    # 1. Save the full integrated data set
    df_integrated.to_csv(os.path.join(save_path, "integrated_patients_data.csv"), index=False)
    print(f"Full data saved to: {os.path.join(save_path, 'integrated_patients_data.csv')}")

    # 2. Five-fold cross-validation split over patient IDs
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    patients = df_integrated[id_col].unique()
    fold = 1
    for train_idx, val_idx in kf.split(patients):
        train_ids = patients[train_idx]
        val_ids = patients[val_idx]
        df_train = df_integrated[df_integrated[id_col].isin(train_ids)]
        df_val = df_integrated[df_integrated[id_col].isin(val_ids)]
        df_train.to_csv(os.path.join(save_path, f"train_fold{fold}.csv"), index=False)
        df_val.to_csv(os.path.join(save_path, f"val_fold{fold}.csv"), index=False)
        print(f"Fold {fold}: train={len(df_train)}, val={len(df_val)}, saved")
        fold += 1

    # 3. Save feature metadata (for later model interpretation; the FSR mechanism of 🔶1-61)
    feature_cols = [col for col in df_integrated.columns
                    if col not in [id_col, "outcome_type", "outcome_days"]]
    feature_info = pd.DataFrame({
        "feature_name": feature_cols,
        "feature_type": ["clinical" if col in df_metainfo_processed.columns
                         else "mutation" if col in df_mutation_processed.columns
                         else "mrna" if col in df_mrna_processed.columns
                         else "cna" if col in df_cna_processed.columns
                         else "outcome" if col in df_outcomes_processed.columns
                         else "" for col in feature_cols],
        "processing_method": ["binary" if col in df_mutation_processed.columns or col in CLINICAL_CAT_FEATURES
                              else "normalized" if col in df_metainfo_processed.columns
                              else "z_score" if col in df_mrna_processed.columns
                              else "ternary" if col in df_cna_processed.columns
                              else "outcome_encoding" if col in df_outcomes_processed.columns
                              else "" for col in feature_cols],
    })
    feature_info.to_csv(os.path.join(save_path, "feature_metadata.csv"), index=False)
    print(f"Feature metadata saved to: {os.path.join(save_path, 'feature_metadata.csv')}")
    return df_train, df_val


df_train, df_val = split_and_save_data(df_integrated)


# ---- Visualize key results (sanity check of the preprocessing) ----
# 1. Outcome distributions (see 🔶1-30)
plt.figure(figsize=(12, 4))

# Outcome type distribution
plt.subplot(1, 2, 1)
outcome_counts = df_integrated["outcome_type"].value_counts().sort_index()
outcome_labels = [OUTCOME_DEF[i] for i in outcome_counts.index]
plt.bar(outcome_labels, outcome_counts.values, color=["lightblue", "lightgreen", "salmon"])
plt.title("Patient outcome distribution (see 🔶1-30)")
plt.ylabel("Patients")

# Recovery time distribution (recovered patients only)
plt.subplot(1, 2, 2)
recover_days = df_integrated[df_integrated["outcome_type"] == 1]["outcome_days"]
plt.hist(recover_days, bins=10, color="lightgreen", edgecolor="black")
plt.title("Recovery time distribution (<= 31 days, see 🔶1-60)")
plt.xlabel("Recovery time (days)")
plt.ylabel("Patients")

plt.tight_layout()
plt.savefig("./processed_data/outcome_distribution.png", dpi=300, bbox_inches="tight")
plt.close()
print("\nOutcome distribution figure saved to: ./processed_data/outcome_distribution.png")

# 2. Correlation of key biomarkers with recovery time (the Pearson analysis of 🔶1-94)
# No biomarker data is available yet, so this block stays commented out.
# from scipy.stats import pearsonr
# markers = ["albumin", "hemoglobin", "total_protein", "ldh"]
# markers = [m for m in markers if m in df_integrated.columns]
# recover_days = df_integrated[df_integrated["outcome_type"] == 1]["outcome_days"]
# corr_results = []
# for marker in markers:
#     marker_vals = df_integrated[df_integrated["outcome_type"] == 1][marker]
#     corr, p_val = pearsonr(marker_vals, recover_days)
#     corr_results.append({"marker": marker, "pearson_corr": round(corr, 3), "p_value": round(p_val, 4)})
# df_corr = pd.DataFrame(corr_results)
# print("\nPearson correlation of key biomarkers with recovery time (see 🔶1-94):")
# print(df_corr.to_string(index=False))
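For a quick end-to-end check, a saved fold can be reloaded and separated into feature and outcome blocks. A minimal sketch, assuming only the file and column names that the script above writes (the actual model step is left out):

import pandas as pd

# Reload the first fold written by split_and_save_data above.
df_train = pd.read_csv("./processed_data/train_fold1.csv")
df_val = pd.read_csv("./processed_data/val_fold1.csv")

# Separate the features from the ID and outcome columns used by the script.
meta_cols = ["patient_id", "outcome_type", "outcome_days"]
X_train = df_train.drop(columns=meta_cols)
y_train = df_train[["outcome_type", "outcome_days"]]

print(f"train: X={X_train.shape}, y={y_train.shape}; val rows={len(df_val)}")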