利用tf.ragged.boolean_mask完成选择mask压缩tensor

本文介绍了如何使用tf.ragged.boolean_mask处理含有对话状态和角色信息的Tensor,通过mask和筛选操作,保留角色为1的utterances状态,并提取最后一个这样的utterance。关键在于理解RaggedTensor的不规则维度处理方式。

现有如下输入:

1. [Batch_size, Seq_len, N_classes]的tensor T

2. [Batch_size, Seq_len]的mask矩阵 M

需求:根据M中的值来mask T,并去掉被mask掉的值

实际场景:主要是现在有一个对话的每个时刻的状态,和角色(0/1),需要取出角色为1的所有utterances的状态,在此基础上选取最后一个角色为1的utterance的状态

问题点:

若使用tf.boolean_mask,则会得到[?, N_classes], ?代表这个方法将每个样本筛选出来的样本压缩成一维,是一个不规则的维度,?数值小于Batch_size * Seq_len

正确操作:

使用tf.ragged.boolean_mask,保留原始的维度,利用不规则tensor特性,得到S=[Batch_size, ?, N_classes], 再使用tf.squeeze(S[:, -1:, :], 1)——先用slice取每个样本的最后一个utterance(此时中间维度为1),再压缩该维度,得到期望结果

注:Ragged Tensor不支持直接index,所以这里使用slice操作,再压缩中间维度。

import numpy as np import tensorflow as tf import pandas as pd import random import argparse from sklearn.model_selection import KFold, train_test_split from sklearn.metrics.pairwise import cosine_similarity from rdkit import Chem from rdkit.Chem import AllChem from tqdm import tqdm import tensorflow.keras.backend as K from tensorflow.keras import layers, Model 配置GPU内存动态分配 gpus = tf.config.list_physical_devices(‘GPU’) if gpus: try: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) logical_gpus = tf.config.list_logical_devices(‘GPU’) print(f"{len(gpus)} 物理GPU, {len(logical_gpus)} 逻辑GPU") except RuntimeError as e: print(e) 补充必要的工具函数 def positional_encoding(max_position, d_model, min_freq=1e-6): position = np.arange(max_position) freqs = min_freq **(2 * (np.arange(d_model) // 2) / d_model) pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1) pos_enc[:, ::2] = np.cos(pos_enc[:, ::2]) pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2]) return pos_enc.astype(np.float32) 初始化位置编码参数 dimn = 64 # 光谱编码维度 cnn_feature_dim = 64 # CNN输出特征维度 transformer_dim = 64 # Transformer特征维度 P = positional_encoding(256, transformer_dim, min_freq=1e2) 光谱数据增强 def augment_spectrum(mz_list, intensity_list, noise_factor=0.01): noisy_mz = [mz + np.random.normal(0, noise_factor) for mz in mz_list] noisy_intensity = [intensity * (1 + np.random.normal(0, noise_factor)) for intensity in intensity_list] noisy_intensity = [max(0, i) for i in noisy_intensity] return noisy_mz, noisy_intensity def prepro_specs_train(df, augment=True): df = df.reset_index(drop=True) valid = [] mz_intensity = df[‘Spectrum’].to_list() def process_line(line): pairs = line.split() mz_list = [] intensity_list = [] for pair in pairs: mz, intensity = pair.split(':') mz_list.append(float(mz)) intensity_list.append(float(intensity)) return mz_list, intensity_list for idx, intensities in tqdm(enumerate(mz_intensity), disable=False, desc="预处理光谱数据"): mz_list, intensity_list = process_line(intensities) 
mz_list.append(float(df.at[idx, 'Total Exact Mass'])) if augment: mz_list, intensity_list = augment_spectrum(mz_list, intensity_list) round_mz_list = [round(float(mz), 2) for mz in mz_list] round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list] valid.append([round_mz_list, round_intensity_list]) return tf.ragged.constant(valid) def prepro_specs_test(df): return prepro_specs_train(df, augment=False) def encoding(rag_tensor, P, dimn): to_pad = [] for sample in tqdm(rag_tensor, desc=“编码光谱数据”): mz_list = sample[0].numpy().tolist() intensity_list = sample[1].numpy().tolist() positions = [min(int(round(intensity * 100)), len(P)-1) for intensity in intensity_list] pos_enc = np.array([P[pos] for pos in positions]) if positions else np.zeros((1, dimn)) averaged_encoding = [np.mean(pos_enc[:, dim]) for dim in range(dimn)] to_pad.append(averaged_encoding) return np.array(to_pad, dtype=np.float32) 将Scaffold转换为摩根指纹 def scaffold_to_morgan(smiles, radius=2, nBits=2048): if pd.isna(smiles) or smiles == ‘’: return np.zeros(nBits, dtype=np.float32) mol = Chem.MolFromSmiles(smiles) if mol is None: return np.zeros(nBits, dtype=np.float32) fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits) return np.array(fp, dtype=np.float32) 自定义Transformer编码器层 def transformer_encoder_layer(units, num_heads, dropout, name=“transformer_encoder_layer”): inputs = layers.Input(shape=(None, units), name=“inputs”) attention = layers.MultiHeadAttention( num_heads=num_heads, key_dim=units//num_heads, name="attention" )(inputs, inputs) attention = layers.Dropout(dropout)(attention) attention = layers.Add()([inputs, attention]) attention = layers.LayerNormalization(epsilon=1e-6)(attention) ffn = layers.Dense(units * 2, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(attention) ffn = layers.Dense(units, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(ffn) ffn = layers.Dropout(dropout)(ffn) outputs = 
layers.Add()([attention, ffn]) outputs = layers.LayerNormalization(epsilon=1e-6)(outputs) return Model(inputs=inputs, outputs=outputs, name=name) 构建CNN+Transformer模型 def build_cnn_transformer_encoder(input_dim=dimn, cnn_filters=cnn_feature_dim, transformer_dim=transformer_dim, num_layers=2, num_heads=2, dropout=0.3): inputs = layers.Input(shape=(input_dim,), name=“input_layer”) x = layers.Reshape((input_dim, 1))(inputs) x = layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='gelu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout/2)(x) # 修复此处的类型错误 x = layers.Conv1D(filters=cnn_filters, kernel_size=5, padding='same', activation='gelu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout/2)(x) # 修复此处的类型错误 if cnn_filters != transformer_dim: x = layers.Conv1D(filters=transformer_dim, kernel_size=1, padding='same')(x) pos_encoding = positional_encoding(input_dim, transformer_dim) pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32) pos_encoding = tf.expand_dims(pos_encoding, axis=0) x = layers.Add()([x, pos_encoding]) for i in range(num_layers): x = transformer_encoder_layer( units=transformer_dim, num_heads=num_heads, dropout=dropout, name=f"transformer_layer_{i}" )(x) x = layers.GlobalAveragePooling1D()(x) x = layers.Dense(2048, activation='relu')(x) x = layers.Dropout(0.2)(x) outputs = layers.Dense(2048, activation='relu', name="output_layer")(x) return Model(inputs=inputs, outputs=outputs, name="cnn_transformer_encoder") NT-Xent损失函数 def nt_xent_loss(y_true, y_pred, temperature=0.05): encoder_output = tf.nn.l2_normalize(y_pred, axis=1) morgan_fp = tf.nn.l2_normalize(y_true, axis=1) samples_per_group = 256 batch_size = tf.shape(encoder_output)[0] num_groups = batch_size // samples_per_group encoder_grouped = tf.reshape(encoder_output, (num_groups, samples_per_group, -1)) morgan_grouped = 
tf.reshape(morgan_fp, (num_groups, samples_per_group, -1)) similarity_matrix = tf.matmul(encoder_grouped, morgan_grouped, transpose_b=True) positive_similarity = similarity_matrix[:, 0, 0] base_mask = tf.logical_not(tf.eye(samples_per_group, dtype=tf.bool)) mask = tf.tile(tf.expand_dims(base_mask, 0), [num_groups, 1, 1]) group_similarities = similarity_matrix[:, 0, :] group_mask = mask[:, 0, :] negative_similarities = tf.boolean_mask(group_similarities, group_mask) numerator = tf.exp(positive_similarity / temperature) denominator = tf.reduce_sum(tf.exp(negative_similarities / temperature), axis=0) per_group_loss = -tf.math.log(numerator / (denominator + numerator)) return tf.reduce_mean(per_group_loss) 批次生成函数 def generate_batches(df, encoded_spectra, morgan_fps, groups_per_batch=2, shuffle=True): valid_smiles = [] smiles_groups = df.groupby(‘SMILES’) for smiles, group in smiles_groups: pos_count = len(group[group['Type'] == 'Pos']) neg_count = len(group[group['Type'] == 'Neg']) if pos_count == 1 and neg_count == 255: valid_smiles.append(smiles) if shuffle: random.shuffle(valid_smiles) for i in range(0, len(valid_smiles), groups_per_batch): batch_smiles = valid_smiles[i:i+groups_per_batch] if not batch_smiles: continue all_spectra = [] all_morgan = [] all_df = [] for smiles in batch_smiles: group = smiles_groups.get_group(smiles) pos_samples = group[group['Type'] == 'Pos'] neg_samples = group[group['Type'] == 'Neg'] ordered_group = pd.concat([pos_samples, neg_samples]) group_indices = ordered_group.index.tolist() all_spectra.append(encoded_spectra[group_indices]) all_morgan.append(morgan_fps[group_indices]) all_df.append(ordered_group) batch_spectra = np.concatenate(all_spectra, axis=0) batch_morgan = np.concatenate(all_morgan, axis=0) batch_df = pd.concat(all_df, ignore_index=True) yield batch_spectra, batch_morgan, batch_df 计算有效批次数 def count_valid_batches(df, groups_per_batch=2): valid_count = 0 for _, group in df.groupby(‘SMILES’): pos_count = 
len(group[group[‘Type’] == ‘Pos’]) neg_count = len(group[group[‘Type’] == ‘Neg’]) if pos_count == 1 and neg_count == 255: valid_count += 1 return (valid_count + groups_per_batch - 1) // groups_per_batch 计算Top1准确度 def calculate_top1_accuracy(model, test_df, test_spectra, test_morgan, groups_per_batch=2): correct = 0 total = 0 test_generator = generate_batches( test_df, test_spectra, test_morgan, groups_per_batch=groups_per_batch, shuffle=False ) test_batch_count = count_valid_batches(test_df, groups_per_batch=groups_per_batch) for _ in tqdm(range(test_batch_count), desc="计算Top1准确度"): batch_spectra, batch_morgan, _ = next(test_generator) encoder_output = model(batch_spectra) samples_per_group = 256 num_groups = len(batch_spectra) // samples_per_group for group_idx in range(num_groups): start_idx = group_idx * samples_per_group end_idx = start_idx + samples_per_group group_encoder = encoder_output[start_idx:end_idx] group_morgan = batch_morgan[start_idx:end_idx] similarities = cosine_similarity(group_encoder, group_morgan) pos_idx = 0 pos_similarities = similarities[pos_idx] max_sim_idx = np.argmax(pos_similarities) if max_sim_idx == pos_idx: correct += 1 total += 1 if total == 0: return 0.0 return correct / total 数据加载和预处理 def load_and_preprocess_data(csv_path, test_size=0.2, random_state=46): print(f"加载数据: {csv_path}") df = pd.read_csv(csv_path) valid_smiles = [] smiles_groups = df.groupby('SMILES') for smiles, group in smiles_groups: pos_count = len(group[group['Type'] == 'Pos']) neg_count = len(group[group['Type'] == 'Neg']) if pos_count == 1 and neg_count == 255: valid_smiles.append(smiles) print(f"有效SMILES组数: {len(valid_smiles)}") # 先划分出独立测试集 train_val_smiles, test_smiles = train_test_split( valid_smiles, test_size=test_size, random_state=random_state ) train_val_df = df[df['SMILES'].isin(train_val_smiles)].reset_index(drop=True) test_df = df[df['SMILES'].isin(test_smiles)].reset_index(drop=True) print(f"训练验证集大小: {len(train_val_df)}, 独立测试集大小: {len(test_df)}") 
print(f"训练验证SMILES数量: {len(train_val_smiles)}, 测试SMILES数量: {len(test_smiles)}") # 处理测试集 print("预处理测试集光谱数据...") test_rag_tensor = prepro_specs_test(test_df) test_encoded_spectra = encoding(test_rag_tensor, P, dimn) print("转换测试集Scaffold为摩根指纹...") test_df['morgan_fp'] = test_df['Scaffold'].apply(scaffold_to_morgan) test_morgan_fps = np.stack(test_df['morgan_fp'].values) return train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps 交叉验证训练函数 def cross_validate_model(train_val_df, train_val_smiles, hyperparams, n_splits=10, epochs=10): “”“执行十折交叉验证评估给定超参数”“” kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) fold_results = [] for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_smiles)): print(f"\n{'='*50}") print(f"开始第 {fold+1}/{n_splits} 折交叉验证") print(f"{'='*50}\n") # 划分当前折的训练集和验证集 train_smiles = [train_val_smiles[i] for i in train_idx] val_smiles = [train_val_smiles[i] for i in val_idx] fold_train_df = train_val_df[train_val_df['SMILES'].isin(train_smiles)].reset_index(drop=True) fold_val_df = train_val_df[train_val_df['SMILES'].isin(val_smiles)].reset_index(drop=True) # 预处理光谱数据 print("预处理当前折训练集光谱数据...") train_rag_tensor = prepro_specs_train(fold_train_df, augment=True) train_encoded_spectra = encoding(train_rag_tensor, P, dimn) print("预处理当前折验证集光谱数据...") val_rag_tensor = prepro_specs_test(fold_val_df) val_encoded_spectra = encoding(val_rag_tensor, P, dimn) # 处理摩根指纹 print("转换当前折训练集Scaffold为摩根指纹...") fold_train_df['morgan_fp'] = fold_train_df['Scaffold'].apply(scaffold_to_morgan) train_morgan_fps = np.stack(fold_train_df['morgan_fp'].values) print("转换当前折验证集Scaffold为摩根指纹...") fold_val_df['morgan_fp'] = fold_val_df['Scaffold'].apply(scaffold_to_morgan) val_morgan_fps = np.stack(fold_val_df['morgan_fp'].values) # 构建模型 model = build_cnn_transformer_encoder( input_dim=dimn, cnn_filters=hyperparams['cnn_filters'], transformer_dim=hyperparams['transformer_dim'], num_layers=hyperparams['num_layers'], num_heads=hyperparams['num_heads'], 
dropout=hyperparams['dropout'] ) # 编译模型 optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate']) model.compile(optimizer=optimizer, loss=nt_xent_loss) # 计算批次数 train_batch_count = count_valid_batches(fold_train_df, groups_per_batch=hyperparams['groups_per_batch']) val_batch_count = count_valid_batches(fold_val_df, groups_per_batch=hyperparams['groups_per_batch']) print(f"当前折训练批次数: {train_batch_count}, 验证批次数: {val_batch_count}") # 训练模型 best_val_acc = 0.0 for epoch in range(epochs): print(f"\n第 {epoch+1}/{epochs} 轮") train_total_loss = 0.0 train_generator = generate_batches( fold_train_df, train_encoded_spectra, train_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=True ) # 训练 for _ in tqdm(range(train_batch_count), desc="训练"): batch_spectra, batch_morgan, _ = next(train_generator) loss = model.train_on_batch(batch_spectra, batch_morgan) train_total_loss += loss train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0 # 验证 val_generator = generate_batches( fold_val_df, val_encoded_spectra, val_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=False ) val_total_loss = 0.0 for _ in range(val_batch_count): batch_spectra, batch_morgan, _ = next(val_generator) loss = model.test_on_batch(batch_spectra, batch_morgan) val_total_loss += loss val_avg_loss = val_total_loss / val_batch_count if val_batch_count > 0 else 0.0 # 计算验证集准确度 val_acc = calculate_top1_accuracy( model, fold_val_df, val_encoded_spectra, val_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'] ) print(f"训练损失: {train_avg_loss:.6f}, 验证损失: {val_avg_loss:.6f}, 验证准确度: {val_acc:.6f}") # 保存当前折中最佳模型 if val_acc > best_val_acc: best_val_acc = val_acc fold_results.append(best_val_acc) print(f"\n第 {fold+1} 折最佳验证准确度: {best_val_acc:.6f}") # 清除内存 K.clear_session() # 计算交叉验证结果 mean_acc = np.mean(fold_results) std_acc = np.std(fold_results) print(f"\n{'='*50}") print(f"十折交叉验证结果: {mean_acc:.6f} ± {std_acc:.6f}") 
print(f"{'='*50}\n") return mean_acc, std_acc, fold_results 在完整训练集上训练最终模型 def train_final_model(train_val_df, hyperparams, epochs=10): print(“\n开始在完整训练集上训练最终模型…”) # 预处理所有训练数据 print("预处理完整训练集光谱数据...") train_rag_tensor = prepro_specs_train(train_val_df, augment=True) train_encoded_spectra = encoding(train_rag_tensor, P, dimn) print("转换完整训练集Scaffold为摩根指纹...") train_val_df['morgan_fp'] = train_val_df['Scaffold'].apply(scaffold_to_morgan) train_morgan_fps = np.stack(train_val_df['morgan_fp'].values) # 构建模型 model = build_cnn_transformer_encoder( input_dim=dimn, cnn_filters=hyperparams['cnn_filters'], transformer_dim=hyperparams['transformer_dim'], num_layers=hyperparams['num_layers'], num_heads=hyperparams['num_heads'], dropout=hyperparams['dropout'] ) # 编译模型 optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate']) model.compile(optimizer=optimizer, loss=nt_xent_loss) # 计算批次数 train_batch_count = count_valid_batches(train_val_df, groups_per_batch=hyperparams['groups_per_batch']) print(f"完整训练集批次数: {train_batch_count}") # 训练模型 for epoch in range(epochs): print(f"\n第 {epoch+1}/{epochs} 轮") train_total_loss = 0.0 train_generator = generate_batches( train_val_df, train_encoded_spectra, train_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=True ) # 训练 for _ in tqdm(range(train_batch_count), desc="训练"): batch_spectra, batch_morgan, _ = next(train_generator) loss = model.train_on_batch(batch_spectra, batch_morgan) train_total_loss += loss train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0 print(f"训练损失: {train_avg_loss:.6f}") return model 主函数 def main(args): # 加载数据并划分训练验证集和独立测试集 train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps = load_and_preprocess_data( args.data, test_size=args.test_size ) # 定义超参数搜索空间 - 修复了参数格式,每个字典代表一组完整的超参数组合 param_grid = [ { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 1e-4, 
'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 
'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch } ] # 超参数优化 best_score = -1 best_params = None results = [] print("\n开始超参数搜索和十折交叉验证...") for i, params in enumerate(param_grid): print(f"\n{'#'*50}") print(f"测试第 {i+1}/{len(param_grid)} 组超参数: {params}") print(f"{'#'*50}\n") mean_acc, std_acc, fold_results = cross_validate_model( train_val_df, train_val_smiles, params, n_splits=10, epochs=args.epochs ) results.append({ 'params': params, 'mean_acc': mean_acc, 'std_acc': std_acc, 'fold_results': fold_results }) if mean_acc > best_score: best_score = mean_acc best_params = params # 输出最佳超参数 print("\n" + "="*70) print("超参数搜索完成!") print(f"最佳超参数: {best_params}") print(f"最佳交叉验证准确度: {best_score:.6f}") print("="*70 + "\n") # 使用最佳超参数训练最终模型 final_model = train_final_model( train_val_df, best_params, epochs=args.epochs ) # 在独立测试集上评估 print("\n在独立测试集上评估最终模型...") test_acc = calculate_top1_accuracy( final_model, test_df, test_encoded_spectra, test_morgan_fps, groups_per_batch=best_params['groups_per_batch'] ) print(f"\n{'='*70}") print(f"独立测试集Top1准确度: {test_acc:.6f}") print("="*70 + "\n") # 保存模型 final_model.save(args.output) print(f"最终模型已保存至: {args.output}") # 保存超参数搜索结果 import json with open('hyperparameter_results.json', 'w') as f: json.dump(results, f, indent=2, default=lambda x: x.tolist() if isinstance(x, np.ndarray) else x) print("超参数搜索结果已保存至 hyperparameter_results.json") if name == “main”: parser = argparse.ArgumentParser(description=‘带十折交叉验证的质谱数据对比学习模型(CNN+Transformer)’) parser.add_argument(‘–data’, type=str, default=‘/home/admin123/code/骨架256.csv’, help=‘CSV数据文件路径’) 
parser.add_argument(‘–epochs’, type=int, default=1000, help=‘训练轮数’) parser.add_argument(‘–test_size’, type=float, default=0.2, help=‘独立测试集比例’) parser.add_argument(‘–groups_per_batch’, type=int, default=8, choices=range(1, 65), help=‘每个批次包含的SMILES组数(1到64之间)’) parser.add_argument(‘–output’, type=str, default=‘best_cnn_transformer_encoder.h5’, help=‘最终模型保存路径’) args = parser.parse_args() main(args)修改代码确保计算损失时提取到的正样本是来自group['Type'] == 'Pos',写出修改后的完整代码
最新发布
08-20
import numpy as np import tensorflow as tf import pandas as pd import random import argparse from sklearn.model_selection import KFold, train_test_split from sklearn.metrics.pairwise import cosine_similarity from rdkit import Chem from rdkit.Chem import AllChem from tqdm import tqdm import tensorflow.keras.backend as K from tensorflow.keras import layers, Model # 配置GPU内存动态分配 gpus = tf.config.list_physical_devices('GPU') if gpus: try: for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) logical_gpus = tf.config.list_logical_devices('GPU') print(f"{len(gpus)} 物理GPU, {len(logical_gpus)} 逻辑GPU") except RuntimeError as e: print(e) # 补充必要的工具函数 def positional_encoding(max_position, d_model, min_freq=1e-6): position = np.arange(max_position) freqs = min_freq **(2 * (np.arange(d_model) // 2) / d_model) pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1) pos_enc[:, ::2] = np.cos(pos_enc[:, ::2]) pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2]) return pos_enc.astype(np.float32) # 初始化位置编码参数 dimn = 64 # 光谱编码维度 cnn_feature_dim = 64 # CNN输出特征维度 transformer_dim = 64 # Transformer特征维度 P = positional_encoding(256, transformer_dim, min_freq=1e2) # 光谱数据增强 def augment_spectrum(mz_list, intensity_list, noise_factor=0.01): noisy_mz = [mz + np.random.normal(0, noise_factor) for mz in mz_list] noisy_intensity = [intensity * (1 + np.random.normal(0, noise_factor)) for intensity in intensity_list] noisy_intensity = [max(0, i) for i in noisy_intensity] return noisy_mz, noisy_intensity def prepro_specs_train(df, augment=True): df = df.reset_index(drop=True) valid = [] mz_intensity = df['Spectrum'].to_list() def process_line(line): pairs = line.split() mz_list = [] intensity_list = [] for pair in pairs: mz, intensity = pair.split(':') mz_list.append(float(mz)) intensity_list.append(float(intensity)) return mz_list, intensity_list for idx, intensities in tqdm(enumerate(mz_intensity), disable=False, desc="预处理光谱数据"): mz_list, intensity_list = process_line(intensities) 
mz_list.append(float(df.at[idx, 'Total Exact Mass'])) if augment: mz_list, intensity_list = augment_spectrum(mz_list, intensity_list) round_mz_list = [round(float(mz), 2) for mz in mz_list] round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list] valid.append([round_mz_list, round_intensity_list]) return tf.ragged.constant(valid) def prepro_specs_test(df): return prepro_specs_train(df, augment=False) def encoding(rag_tensor, P, dimn): to_pad = [] for sample in tqdm(rag_tensor, desc="编码光谱数据"): mz_list = sample[0].numpy().tolist() intensity_list = sample[1].numpy().tolist() positions = [min(int(round(intensity * 100)), len(P)-1) for intensity in intensity_list] pos_enc = np.array([P[pos] for pos in positions]) if positions else np.zeros((1, dimn)) averaged_encoding = [np.mean(pos_enc[:, dim]) for dim in range(dimn)] to_pad.append(averaged_encoding) return np.array(to_pad, dtype=np.float32) # 将Scaffold转换为摩根指纹 def scaffold_to_morgan(smiles, radius=2, nBits=2048): if pd.isna(smiles) or smiles == '': return np.zeros(nBits, dtype=np.float32) mol = Chem.MolFromSmiles(smiles) if mol is None: return np.zeros(nBits, dtype=np.float32) fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits) return np.array(fp, dtype=np.float32) # 自定义Transformer编码器层 def transformer_encoder_layer(units, num_heads, dropout, name="transformer_encoder_layer"): inputs = layers.Input(shape=(None, units), name="inputs") attention = layers.MultiHeadAttention( num_heads=num_heads, key_dim=units//num_heads, name="attention" )(inputs, inputs) attention = layers.Dropout(dropout)(attention) attention = layers.Add()([inputs, attention]) attention = layers.LayerNormalization(epsilon=1e-6)(attention) ffn = layers.Dense(units * 2, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(attention) ffn = layers.Dense(units, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(ffn) ffn = layers.Dropout(dropout)(ffn) outputs = 
layers.Add()([attention, ffn]) outputs = layers.LayerNormalization(epsilon=1e-6)(outputs) return Model(inputs=inputs, outputs=outputs, name=name) # 构建CNN+Transformer模型 def build_cnn_transformer_encoder(input_dim=dimn, cnn_filters=cnn_feature_dim, transformer_dim=transformer_dim, num_layers=2, num_heads=2, dropout=0.3): inputs = layers.Input(shape=(input_dim,), name="input_layer") x = layers.Reshape((input_dim, 1))(inputs) x = layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='gelu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout/2)(x) # 修复此处的类型错误 x = layers.Conv1D(filters=cnn_filters, kernel_size=5, padding='same', activation='gelu', kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x) x = layers.BatchNormalization()(x) x = layers.Dropout(dropout/2)(x) # 修复此处的类型错误 if cnn_filters != transformer_dim: x = layers.Conv1D(filters=transformer_dim, kernel_size=1, padding='same')(x) pos_encoding = positional_encoding(input_dim, transformer_dim) pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32) pos_encoding = tf.expand_dims(pos_encoding, axis=0) x = layers.Add()([x, pos_encoding]) for i in range(num_layers): x = transformer_encoder_layer( units=transformer_dim, num_heads=num_heads, dropout=dropout, name=f"transformer_layer_{i}" )(x) x = layers.GlobalAveragePooling1D()(x) x = layers.Dense(2048, activation='relu')(x) x = layers.Dropout(0.2)(x) outputs = layers.Dense(2048, activation='relu', name="output_layer")(x) return Model(inputs=inputs, outputs=outputs, name="cnn_transformer_encoder") # NT-Xent损失函数 def nt_xent_loss(y_true, y_pred, temperature=0.05): encoder_output = tf.nn.l2_normalize(y_pred, axis=1) morgan_fp = tf.nn.l2_normalize(y_true, axis=1) samples_per_group = 256 batch_size = tf.shape(encoder_output)[0] num_groups = batch_size // samples_per_group encoder_grouped = tf.reshape(encoder_output, (num_groups, samples_per_group, -1)) morgan_grouped = 
tf.reshape(morgan_fp, (num_groups, samples_per_group, -1)) similarity_matrix = tf.matmul(encoder_grouped, morgan_grouped, transpose_b=True) positive_similarity = similarity_matrix[:, 0, 0] base_mask = tf.logical_not(tf.eye(samples_per_group, dtype=tf.bool)) mask = tf.tile(tf.expand_dims(base_mask, 0), [num_groups, 1, 1]) group_similarities = similarity_matrix[:, 0, :] group_mask = mask[:, 0, :] negative_similarities = tf.boolean_mask(group_similarities, group_mask) numerator = tf.exp(positive_similarity / temperature) denominator = tf.reduce_sum(tf.exp(negative_similarities / temperature), axis=0) per_group_loss = -tf.math.log(numerator / (denominator + numerator)) return tf.reduce_mean(per_group_loss) # 批次生成函数 def generate_batches(df, encoded_spectra, morgan_fps, groups_per_batch=2, shuffle=True): valid_smiles = [] smiles_groups = df.groupby('SMILES') for smiles, group in smiles_groups: pos_count = len(group[group['Type'] == 'Pos']) neg_count = len(group[group['Type'] == 'Neg']) if pos_count == 1 and neg_count == 255: valid_smiles.append(smiles) if shuffle: random.shuffle(valid_smiles) for i in range(0, len(valid_smiles), groups_per_batch): batch_smiles = valid_smiles[i:i+groups_per_batch] if not batch_smiles: continue all_spectra = [] all_morgan = [] all_df = [] for smiles in batch_smiles: group = smiles_groups.get_group(smiles) pos_samples = group[group['Type'] == 'Pos'] neg_samples = group[group['Type'] == 'Neg'] ordered_group = pd.concat([pos_samples, neg_samples]) group_indices = ordered_group.index.tolist() all_spectra.append(encoded_spectra[group_indices]) all_morgan.append(morgan_fps[group_indices]) all_df.append(ordered_group) batch_spectra = np.concatenate(all_spectra, axis=0) batch_morgan = np.concatenate(all_morgan, axis=0) batch_df = pd.concat(all_df, ignore_index=True) yield batch_spectra, batch_morgan, batch_df # 计算有效批次数 def count_valid_batches(df, groups_per_batch=2): valid_count = 0 for _, group in df.groupby('SMILES'): pos_count = 
len(group[group['Type'] == 'Pos']) neg_count = len(group[group['Type'] == 'Neg']) if pos_count == 1 and neg_count == 255: valid_count += 1 return (valid_count + groups_per_batch - 1) // groups_per_batch # 计算Top1准确度 def calculate_top1_accuracy(model, test_df, test_spectra, test_morgan, groups_per_batch=2): correct = 0 total = 0 test_generator = generate_batches( test_df, test_spectra, test_morgan, groups_per_batch=groups_per_batch, shuffle=False ) test_batch_count = count_valid_batches(test_df, groups_per_batch=groups_per_batch) for _ in tqdm(range(test_batch_count), desc="计算Top1准确度"): batch_spectra, batch_morgan, _ = next(test_generator) encoder_output = model(batch_spectra) samples_per_group = 256 num_groups = len(batch_spectra) // samples_per_group for group_idx in range(num_groups): start_idx = group_idx * samples_per_group end_idx = start_idx + samples_per_group group_encoder = encoder_output[start_idx:end_idx] group_morgan = batch_morgan[start_idx:end_idx] similarities = cosine_similarity(group_encoder, group_morgan) pos_idx = 0 pos_similarities = similarities[pos_idx] max_sim_idx = np.argmax(pos_similarities) if max_sim_idx == pos_idx: correct += 1 total += 1 if total == 0: return 0.0 return correct / total # 数据加载和预处理 def load_and_preprocess_data(csv_path, test_size=0.2, random_state=46): print(f"加载数据: {csv_path}") df = pd.read_csv(csv_path) valid_smiles = [] smiles_groups = df.groupby('SMILES') for smiles, group in smiles_groups: pos_count = len(group[group['Type'] == 'Pos']) neg_count = len(group[group['Type'] == 'Neg']) if pos_count == 1 and neg_count == 255: valid_smiles.append(smiles) print(f"有效SMILES组数: {len(valid_smiles)}") # 先划分出独立测试集 train_val_smiles, test_smiles = train_test_split( valid_smiles, test_size=test_size, random_state=random_state ) train_val_df = df[df['SMILES'].isin(train_val_smiles)].reset_index(drop=True) test_df = df[df['SMILES'].isin(test_smiles)].reset_index(drop=True) print(f"训练验证集大小: {len(train_val_df)}, 独立测试集大小: {len(test_df)}") 
print(f"训练验证SMILES数量: {len(train_val_smiles)}, 测试SMILES数量: {len(test_smiles)}") # 处理测试集 print("预处理测试集光谱数据...") test_rag_tensor = prepro_specs_test(test_df) test_encoded_spectra = encoding(test_rag_tensor, P, dimn) print("转换测试集Scaffold为摩根指纹...") test_df['morgan_fp'] = test_df['Scaffold'].apply(scaffold_to_morgan) test_morgan_fps = np.stack(test_df['morgan_fp'].values) return train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps # 交叉验证训练函数 def cross_validate_model(train_val_df, train_val_smiles, hyperparams, n_splits=10, epochs=10): """执行十折交叉验证评估给定超参数""" kf = KFold(n_splits=n_splits, shuffle=True, random_state=42) fold_results = [] for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_smiles)): print(f"\n{'='*50}") print(f"开始第 {fold+1}/{n_splits} 折交叉验证") print(f"{'='*50}\n") # 划分当前折的训练集和验证集 train_smiles = [train_val_smiles[i] for i in train_idx] val_smiles = [train_val_smiles[i] for i in val_idx] fold_train_df = train_val_df[train_val_df['SMILES'].isin(train_smiles)].reset_index(drop=True) fold_val_df = train_val_df[train_val_df['SMILES'].isin(val_smiles)].reset_index(drop=True) # 预处理光谱数据 print("预处理当前折训练集光谱数据...") train_rag_tensor = prepro_specs_train(fold_train_df, augment=True) train_encoded_spectra = encoding(train_rag_tensor, P, dimn) print("预处理当前折验证集光谱数据...") val_rag_tensor = prepro_specs_test(fold_val_df) val_encoded_spectra = encoding(val_rag_tensor, P, dimn) # 处理摩根指纹 print("转换当前折训练集Scaffold为摩根指纹...") fold_train_df['morgan_fp'] = fold_train_df['Scaffold'].apply(scaffold_to_morgan) train_morgan_fps = np.stack(fold_train_df['morgan_fp'].values) print("转换当前折验证集Scaffold为摩根指纹...") fold_val_df['morgan_fp'] = fold_val_df['Scaffold'].apply(scaffold_to_morgan) val_morgan_fps = np.stack(fold_val_df['morgan_fp'].values) # 构建模型 model = build_cnn_transformer_encoder( input_dim=dimn, cnn_filters=hyperparams['cnn_filters'], transformer_dim=hyperparams['transformer_dim'], num_layers=hyperparams['num_layers'], 
num_heads=hyperparams['num_heads'], dropout=hyperparams['dropout'] ) # 编译模型 optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate']) model.compile(optimizer=optimizer, loss=nt_xent_loss) # 计算批次数 train_batch_count = count_valid_batches(fold_train_df, groups_per_batch=hyperparams['groups_per_batch']) val_batch_count = count_valid_batches(fold_val_df, groups_per_batch=hyperparams['groups_per_batch']) print(f"当前折训练批次数: {train_batch_count}, 验证批次数: {val_batch_count}") # 训练模型 best_val_acc = 0.0 for epoch in range(epochs): print(f"\n第 {epoch+1}/{epochs} 轮") train_total_loss = 0.0 train_generator = generate_batches( fold_train_df, train_encoded_spectra, train_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=True ) # 训练 for _ in tqdm(range(train_batch_count), desc="训练"): batch_spectra, batch_morgan, _ = next(train_generator) loss = model.train_on_batch(batch_spectra, batch_morgan) train_total_loss += loss train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0 # 验证 val_generator = generate_batches( fold_val_df, val_encoded_spectra, val_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=False ) val_total_loss = 0.0 for _ in range(val_batch_count): batch_spectra, batch_morgan, _ = next(val_generator) loss = model.test_on_batch(batch_spectra, batch_morgan) val_total_loss += loss val_avg_loss = val_total_loss / val_batch_count if val_batch_count > 0 else 0.0 # 计算验证集准确度 val_acc = calculate_top1_accuracy( model, fold_val_df, val_encoded_spectra, val_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'] ) print(f"训练损失: {train_avg_loss:.6f}, 验证损失: {val_avg_loss:.6f}, 验证准确度: {val_acc:.6f}") # 保存当前折中最佳模型 if val_acc > best_val_acc: best_val_acc = val_acc fold_results.append(best_val_acc) print(f"\n第 {fold+1} 折最佳验证准确度: {best_val_acc:.6f}") # 清除内存 K.clear_session() # 计算交叉验证结果 mean_acc = np.mean(fold_results) std_acc = np.std(fold_results) print(f"\n{'='*50}") print(f"十折交叉验证结果: 
{mean_acc:.6f} ± {std_acc:.6f}") print(f"{'='*50}\n") return mean_acc, std_acc, fold_results # 在完整训练集上训练最终模型 def train_final_model(train_val_df, hyperparams, epochs=10): print("\n开始在完整训练集上训练最终模型...") # 预处理所有训练数据 print("预处理完整训练集光谱数据...") train_rag_tensor = prepro_specs_train(train_val_df, augment=True) train_encoded_spectra = encoding(train_rag_tensor, P, dimn) print("转换完整训练集Scaffold为摩根指纹...") train_val_df['morgan_fp'] = train_val_df['Scaffold'].apply(scaffold_to_morgan) train_morgan_fps = np.stack(train_val_df['morgan_fp'].values) # 构建模型 model = build_cnn_transformer_encoder( input_dim=dimn, cnn_filters=hyperparams['cnn_filters'], transformer_dim=hyperparams['transformer_dim'], num_layers=hyperparams['num_layers'], num_heads=hyperparams['num_heads'], dropout=hyperparams['dropout'] ) # 编译模型 optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate']) model.compile(optimizer=optimizer, loss=nt_xent_loss) # 计算批次数 train_batch_count = count_valid_batches(train_val_df, groups_per_batch=hyperparams['groups_per_batch']) print(f"完整训练集批次数: {train_batch_count}") # 训练模型 for epoch in range(epochs): print(f"\n第 {epoch+1}/{epochs} 轮") train_total_loss = 0.0 train_generator = generate_batches( train_val_df, train_encoded_spectra, train_morgan_fps, groups_per_batch=hyperparams['groups_per_batch'], shuffle=True ) # 训练 for _ in tqdm(range(train_batch_count), desc="训练"): batch_spectra, batch_morgan, _ = next(train_generator) loss = model.train_on_batch(batch_spectra, batch_morgan) train_total_loss += loss train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0 print(f"训练损失: {train_avg_loss:.6f}") return model # 主函数 def main(args): # 加载数据并划分训练验证集和独立测试集 train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps = load_and_preprocess_data( args.data, test_size=args.test_size ) # 定义超参数搜索空间 - 修复了参数格式,每个字典代表一组完整的超参数组合 param_grid = [ { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 
0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 2, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': 
args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.3, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 1e-4, 'groups_per_batch': args.groups_per_batch }, { 'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 2, 'num_heads': 4, 'dropout': 0.4, 'learning_rate': 5e-5, 'groups_per_batch': args.groups_per_batch } ] # 超参数优化 best_score = -1 best_params = None results = [] print("\n开始超参数搜索和十折交叉验证...") for i, params in enumerate(param_grid): print(f"\n{'#'*50}") print(f"测试第 {i+1}/{len(param_grid)} 组超参数: {params}") print(f"{'#'*50}\n") mean_acc, std_acc, fold_results = cross_validate_model( train_val_df, train_val_smiles, params, n_splits=10, epochs=args.epochs ) results.append({ 'params': params, 'mean_acc': mean_acc, 'std_acc': std_acc, 'fold_results': fold_results }) if mean_acc > best_score: best_score = mean_acc best_params = params # 输出最佳超参数 print("\n" + "="*70) print("超参数搜索完成!") print(f"最佳超参数: {best_params}") print(f"最佳交叉验证准确度: {best_score:.6f}") print("="*70 + "\n") # 使用最佳超参数训练最终模型 final_model = train_final_model( train_val_df, best_params, epochs=args.epochs ) # 在独立测试集上评估 print("\n在独立测试集上评估最终模型...") test_acc = calculate_top1_accuracy( final_model, test_df, test_encoded_spectra, test_morgan_fps, groups_per_batch=best_params['groups_per_batch'] ) print(f"\n{'='*70}") print(f"独立测试集Top1准确度: {test_acc:.6f}") print("="*70 + "\n") # 保存模型 final_model.save(args.output) print(f"最终模型已保存至: {args.output}") # 保存超参数搜索结果 import json with open('hyperparameter_results.json', 'w') as f: json.dump(results, f, indent=2, default=lambda x: x.tolist() if isinstance(x, np.ndarray) else x) print("超参数搜索结果已保存至 hyperparameter_results.json") if __name__ == "__main__": parser = argparse.ArgumentParser(description='带十折交叉验证的质谱数据对比学习模型(CNN+Transformer)') parser.add_argument('--data', type=str, 
default='/home/admin123/code/骨架256.csv', help='CSV数据文件路径') parser.add_argument('--epochs', type=int, default=1000, help='训练轮数') parser.add_argument('--test_size', type=float, default=0.2, help='独立测试集比例') parser.add_argument('--groups_per_batch', type=int, default=8, choices=range(1, 65), help='每个批次包含的SMILES组数(1到64之间)') parser.add_argument('--output', type=str, default='best_cnn_transformer_encoder.h5', help='最终模型保存路径') args = parser.parse_args() main(args)修改代码确保计算损失时提取到的正样本是来自group['Type'] == 'Pos'
08-20
import numpy as np
import tensorflow as tf

# Enable on-demand GPU memory growth instead of reserving all memory up front.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(f"{len(gpus)} 物理GPU, {len(logical_gpus)} 逻辑GPU")
    except RuntimeError as e:
        print(e)

from tensorflow.keras import layers, Model, callbacks
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Scaffolds import MurckoScaffold
import pandas as pd
from tqdm import tqdm
import random
from multiprocessing import Pool, cpu_count
import tensorflow.keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


def positional_encoding(max_position, d_model, min_freq=1e-6):
    """Build a sinusoidal positional-encoding table of shape (max_position, d_model).

    NOTE(review): cos is applied to even dims and sin to odd dims, which is
    swapped relative to the canonical Transformer formula; kept as-is to
    preserve the original behavior.
    """
    position = np.arange(max_position)
    freqs = min_freq ** (2 * (np.arange(d_model) // 2) / d_model)
    pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
    pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
    pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
    return pos_enc.astype(np.float32)


# Global dimensions: spectrum encoding width matches the Transformer width.
dimn = 64               # spectrum encoding dimension
cnn_feature_dim = 64    # CNN output feature dimension
transformer_dim = 64    # Transformer feature dimension
P = positional_encoding(256, transformer_dim, min_freq=1e2)


def augment_spectrum(mz_list, intensity_list, noise_factor=0.01):
    """Add small Gaussian noise to m/z values and intensities (training-only augmentation)."""
    noisy_mz = [mz + np.random.normal(0, noise_factor) for mz in mz_list]
    noisy_intensity = [intensity * (1 + np.random.normal(0, noise_factor))
                       for intensity in intensity_list]
    # Intensities must remain non-negative after noise injection.
    noisy_intensity = [max(0, i) for i in noisy_intensity]
    return noisy_mz, noisy_intensity


def prepro_specs_train(df, augment=True):
    """Parse 'mz:intensity' spectrum strings into a RaggedTensor of [[mz...], [intensity...]].

    Appends the precursor m/z ('Total Exact Mass') to each mz list, so the mz
    list is one element longer than the intensity list by design.
    """
    df = df.reset_index(drop=True)
    valid = []
    mz_intensity = df['Spectrum'].to_list()

    def process_line(line):
        # Each spectrum is a whitespace-separated list of "mz:intensity" pairs.
        pairs = line.split()
        mz_list = []
        intensity_list = []
        for pair in pairs:
            mz, intensity = pair.split(':')
            mz_list.append(float(mz))
            intensity_list.append(float(intensity))
        return mz_list, intensity_list

    for idx, intensities in tqdm(enumerate(mz_intensity), disable=False, desc="预处理光谱数据"):
        mz_list, intensity_list = process_line(intensities)
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))  # precursor m/z
        if augment:
            mz_list, intensity_list = augment_spectrum(mz_list, intensity_list)
        # Keep two decimal places.
        round_mz_list = [round(float(mz), 2) for mz in mz_list]
        round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
        valid.append([round_mz_list, round_intensity_list])
    return tf.ragged.constant(valid)


def prepro_specs_test(df):
    """Test-set preprocessing: identical pipeline with augmentation disabled."""
    return prepro_specs_train(df, augment=False)


def encoding(rag_tensor, P, dimn):
    """Encode each spectrum as the per-dimension mean of positional encodings.

    Intensities are scaled by 100 and clipped to index into the table P; the
    m/z values are read but intentionally unused for the encoding itself.
    """
    to_pad = []
    for sample in tqdm(rag_tensor, desc="编码光谱数据"):
        mz_list = sample[0].numpy().tolist()
        intensity_list = sample[1].numpy().tolist()
        positions = [min(int(round(intensity * 100)), len(P) - 1)
                     for intensity in intensity_list]
        pos_enc = np.array([P[pos] for pos in positions]) if positions else np.zeros((1, dimn))
        averaged_encoding = [np.mean(pos_enc[:, dim]) for dim in range(dimn)]
        to_pad.append(averaged_encoding)
    return np.array(to_pad, dtype=np.float32)


def trun_n_d(n, d):
    """Truncate the decimal string n to d decimal places without rounding."""
    return n if not n.find('.') + 1 else n[:n.find('.') + d + 1]


def scaffold_to_morgan(smiles, radius=2, nBits=2048):
    """Convert a scaffold SMILES to a Morgan fingerprint; zeros for missing/invalid SMILES."""
    if pd.isna(smiles) or smiles == '':
        return np.zeros(nBits, dtype=np.float32)
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(nBits, dtype=np.float32)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
    return np.array(fp, dtype=np.float32)


def transformer_encoder_layer(units, num_heads, dropout, name="transformer_encoder_layer"):
    """One post-norm Transformer encoder block: MHA + FFN, each with residual + LayerNorm."""
    inputs = layers.Input(shape=(None, units), name="inputs")
    # Multi-head self-attention.
    attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=units // num_heads, name="attention"
    )(inputs, inputs)
    # Residual connection + layer norm + dropout regularization.
    attention = layers.Dropout(dropout)(attention)
    attention = layers.Add()([inputs, attention])
    attention = layers.LayerNormalization(epsilon=1e-6)(attention)
    # Feed-forward network with L2 regularization.
    ffn = layers.Dense(units * 2, activation="gelu",
                       kernel_regularizer=tf.keras.regularizers.l2(1e-4))(attention)
    ffn = layers.Dense(units, activation="gelu",
                       kernel_regularizer=tf.keras.regularizers.l2(1e-4))(ffn)
    ffn = layers.Dropout(dropout)(ffn)
    outputs = layers.Add()([attention, ffn])
    outputs = layers.LayerNormalization(epsilon=1e-6)(outputs)
    return Model(inputs=inputs, outputs=outputs, name=name)


def build_cnn_transformer_encoder(input_dim=dimn, cnn_filters=cnn_feature_dim,
                                  transformer_dim=transformer_dim,
                                  num_layers=2, num_heads=2, dropout=0.3):
    """CNN+Transformer hybrid encoder.

    1. CNN stage: 1D convolutions extract local spectral features.
    2. Transformer stage: self-attention models global dependencies.
    Output is a 2048-d vector matching the Morgan fingerprint dimension.
    """
    inputs = layers.Input(shape=(input_dim,), name="input_layer")
    # Add a channel axis so Conv1D can treat the encoding as a length-input_dim sequence.
    x = layers.Reshape((input_dim, 1))(inputs)                       # (None, 64, 1)

    # First conv block.
    x = layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='gelu',
                      kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)  # (None, 64, 32)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout / 2)(x)

    # Second conv block.
    x = layers.Conv1D(filters=cnn_filters, kernel_size=5, padding='same', activation='gelu',
                      kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)  # (None, 64, 64)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout / 2)(x)

    # Project channels to the Transformer width if they differ.
    if cnn_filters != transformer_dim:
        x = layers.Conv1D(filters=transformer_dim, kernel_size=1, padding='same')(x)

    # Add positional encoding (broadcast over the batch axis).
    pos_encoding = positional_encoding(input_dim, transformer_dim)
    pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32)
    pos_encoding = tf.expand_dims(pos_encoding, axis=0)              # (1, 64, 64)
    x = layers.Add()([x, pos_encoding])                              # (None, 64, 64)

    # Transformer stage: global dependency modeling.
    for i in range(num_layers):
        x = transformer_encoder_layer(
            units=transformer_dim, num_heads=num_heads, dropout=dropout,
            name=f"transformer_layer_{i}"
        )(x)

    # Sequence-level feature via global average pooling.
    x = layers.GlobalAveragePooling1D()(x)                           # (None, 64)
    x = layers.Dense(2048, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    # Output layer sized to match the Morgan fingerprint dimension (2048).
    outputs = layers.Dense(2048, activation='relu', name="output_layer")(x)
    return Model(inputs=inputs, outputs=outputs, name="cnn_transformer_encoder")


def nt_xent_loss(y_true, y_pred, temperature=0.05):
    """XLA-safe NT-Xent loss.

    FIX: the previous implementation extracted positives/negatives with
    tf.boolean_mask followed by a dynamic tf.reshape. boolean_mask produces a
    data-dependent output shape, which the XLA compiler cannot handle and
    which raised:
        InvalidArgumentError: Input to reshape is a tensor with 1048576
        values, but the requested shape has 1024
    This version computes the identical loss with static shapes only:
      * positives  = diagonal of the similarity matrix (tf.linalg.diag_part)
      * denominator = row-sum of exp(similarity / T) over negatives only,
        obtained by zeroing out the diagonal — exactly the negative set the
        original boolean_mask(~mask) selected.
    """
    # L2-normalize both views so the dot product is cosine similarity.
    encoder_output = tf.nn.l2_normalize(y_pred, axis=1)          # (B, 2048)
    morgan_fp = tf.nn.l2_normalize(tf.cast(y_true, y_pred.dtype), axis=1)  # (B, 2048)

    # Cosine-similarity matrix scaled by temperature.
    similarity_matrix = tf.matmul(encoder_output, morgan_fp, transpose_b=True)  # (B, B)
    logits = similarity_matrix / temperature

    batch_size = tf.shape(logits)[0]

    # Positive pairs sit on the diagonal; diag_part keeps shapes static. (B,)
    positive_logits = tf.linalg.diag_part(logits)

    # Zero the diagonal so the denominator sums over negatives only. (B,)
    off_diagonal = 1.0 - tf.eye(batch_size, dtype=logits.dtype)
    denominator = tf.reduce_sum(tf.exp(logits) * off_diagonal, axis=1)

    # -log( exp(pos) / sum_neg exp(neg) ) == -(pos - log(denominator))
    loss = -(positive_logits - tf.math.log(denominator))
    return tf.reduce_mean(loss)


def generate_batches(df, encoded_spectra, morgan_fps, groups_per_batch=2, shuffle=True):
    """Yield (spectra, morgan_fps, df) batches built from whole SMILES groups.

    Only SMILES groups containing at least one 'Pos' AND one 'Neg' sample are
    used; each batch concatenates `groups_per_batch` such groups.
    """
    valid_smiles = []
    smiles_groups = df.groupby('SMILES')
    for smiles, group in smiles_groups:
        has_pos = len(group[group['Type'] == 'Pos']) > 0
        has_neg = len(group[group['Type'] == 'Neg']) > 0
        if has_pos and has_neg:
            valid_smiles.append(smiles)
    if shuffle:
        random.shuffle(valid_smiles)
    for i in range(0, len(valid_smiles), groups_per_batch):
        batch_smiles = valid_smiles[i:i + groups_per_batch]
        if not batch_smiles:
            continue
        all_spectra = []
        all_morgan = []
        all_df = []
        for smiles in batch_smiles:
            group = smiles_groups.get_group(smiles)
            group_indices = group.index.tolist()
            all_spectra.append(encoded_spectra[group_indices])
            all_morgan.append(morgan_fps[group_indices])
            all_df.append(group)
        batch_spectra = np.concatenate(all_spectra, axis=0)
        batch_morgan = np.concatenate(all_morgan, axis=0)
        batch_df = pd.concat(all_df, ignore_index=True)
        # Spectra and fingerprints must stay aligned row-for-row.
        assert len(batch_spectra) == len(batch_morgan), \
            f"批次样本数不匹配: 光谱={len(batch_spectra)}, 摩根指纹={len(batch_morgan)}"
        yield batch_spectra, batch_morgan, batch_df


def count_valid_batches(df, groups_per_batch=2):
    """Number of batches generate_batches() will yield for this DataFrame."""
    valid_count = 0
    for _, group in df.groupby('SMILES'):
        has_pos = len(group[group['Type'] == 'Pos']) > 0
        has_neg = len(group[group['Type'] == 'Neg']) > 0
        if has_pos and has_neg:
            valid_count += 1
    # Ceiling division: a partial final batch still counts.
    return (valid_count + groups_per_batch - 1) // groups_per_batch


def calculate_top1_accuracy(model, test_df, test_spectra, test_morgan,
                            groups_per_batch=2, batch_size=32):
    """Top-1 retrieval accuracy: for each 'Pos' sample, its encoder output must
    be most cosine-similar to its own Morgan fingerprint within the group."""
    correct = 0
    total = 0
    test_generator = generate_batches(
        test_df, test_spectra, test_morgan,
        groups_per_batch=groups_per_batch, shuffle=False
    )
    test_batch_count = count_valid_batches(test_df, groups_per_batch=groups_per_batch)
    for _ in tqdm(range(test_batch_count), desc="计算Top1准确度批次"):
        batch_spectra, batch_morgan, batch_df = next(test_generator)
        # Encode in sub-batches to bound memory.
        encoder_outputs = []
        for i in range(0, len(batch_spectra), batch_size):
            batch_s = batch_spectra[i:i + batch_size]
            encoder_output = model(batch_s)
            encoder_outputs.append(encoder_output)
        encoder_output = np.concatenate(encoder_outputs, axis=0)
        smiles_groups = batch_df.groupby('SMILES')
        for smiles, group in smiles_groups:
            # batch_df was concat'ed with ignore_index=True, so these indices
            # address rows of encoder_output / batch_morgan directly.
            group_indices = group.index.tolist()
            group_encoder = encoder_output[group_indices]
            group_morgan = batch_morgan[group_indices]
            pos_samples = group[group['Type'] == 'Pos']
            if len(pos_samples) == 0:
                continue
            similarities = cosine_similarity(group_encoder, group_morgan)
            for pos_idx in pos_samples.index:
                group_pos_idx = group_indices.index(pos_idx)
                pos_similarities = similarities[group_pos_idx]
                max_sim_idx = np.argmax(pos_similarities)
                if max_sim_idx == group_pos_idx:
                    correct += 1
                total += 1
    if total == 0:
        return 0.0
    return correct / total


def load_and_preprocess_data(csv_path, test_size=0.2, random_state=42):
    """Load the CSV, split train/test by unique SMILES, and precompute
    spectrum encodings (train augmented, test not) and Morgan fingerprints."""
    print(f"加载数据: {csv_path}")
    df = pd.read_csv(csv_path)

    # Split on unique SMILES so no molecule leaks across the split.
    unique_smiles = df['SMILES'].unique()
    train_smiles, test_smiles = train_test_split(
        unique_smiles, test_size=test_size, random_state=random_state
    )
    train_df = df[df['SMILES'].isin(train_smiles)].reset_index(drop=True)
    test_df = df[df['SMILES'].isin(test_smiles)].reset_index(drop=True)
    print(f"训练集大小: {len(train_df)}, 测试集大小: {len(test_df)}")
    print(f"训练集SMILES数量: {len(train_smiles)}, 测试集SMILES数量: {len(test_smiles)}")

    print("预处理训练集光谱数据...")
    train_rag_tensor = prepro_specs_train(train_df, augment=True)
    train_encoded_spectra = encoding(train_rag_tensor, P, dimn)
    print(f"训练集光谱编码形状: {train_encoded_spectra.shape}")

    print("预处理测试集光谱数据...")
    test_rag_tensor = prepro_specs_test(test_df)
    test_encoded_spectra = encoding(test_rag_tensor, P, dimn)
    print(f"测试集光谱编码形状: {test_encoded_spectra.shape}")

    print("转换训练集Scaffold为摩根指纹...")
    train_df['morgan_fp'] = train_df['Scaffold'].apply(scaffold_to_morgan)
    train_morgan_fps = np.stack(train_df['morgan_fp'].values)
    print(f"训练集摩根指纹形状: {train_morgan_fps.shape}")

    print("转换测试集Scaffold为摩根指纹...")
    test_df['morgan_fp'] = test_df['Scaffold'].apply(scaffold_to_morgan)
    test_morgan_fps = np.stack(test_df['morgan_fp'].values)
    print(f"测试集摩根指纹形状: {test_morgan_fps.shape}")

    return (train_df, train_encoded_spectra, train_morgan_fps,
            test_df, test_encoded_spectra, test_morgan_fps)


def train_model(csv_path, epochs=10, learning_rate=1e-4, test_size=0.2, groups_per_batch=2):
    """Full training loop: preprocess, build, and train the contrastive encoder."""
    (train_df, train_spectra, train_morgan,
     test_df, test_spectra, test_morgan) = load_and_preprocess_data(
        csv_path, test_size=test_size
    )

    model = build_cnn_transformer_encoder(
        input_dim=dimn,
        cnn_filters=cnn_feature_dim,
        transformer_dim=transformer_dim,
        num_layers=1,   # Transformer layers
        num_heads=2,    # attention heads
        dropout=0.3
    )
    model.summary()

    print("\n" + "=" * 70)
    print("CNN+Transformer模型结构已确认,开始准备数据生成器并启动训练流程...")
    print(f"训练配置:epochs={epochs}, learning_rate={learning_rate}, test_size={test_size}, groups_per_batch={groups_per_batch}")
    print("=" * 70 + "\n")

    # Compile with the XLA-safe NT-Xent loss.
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=nt_xent_loss)

    train_batch_count = count_valid_batches(train_df, groups_per_batch=groups_per_batch)
    test_batch_count = count_valid_batches(test_df, groups_per_batch=groups_per_batch)
    print(f"有效训练批次: {train_batch_count}, 有效测试批次: {test_batch_count}")
    print(f"每个批次包含 {groups_per_batch} 组SMILES样本")

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        print(f"当前学习率: {model.optimizer.learning_rate.numpy():.8f}")
        train_total_loss = 0.0
        # Fresh, shuffled generator each epoch.
        train_generator = generate_batches(
            train_df, train_spectra, train_morgan,
            groups_per_batch=groups_per_batch, shuffle=True
        )
        for _ in tqdm(range(train_batch_count), desc="训练"):
            batch_spectra, batch_morgan, _ = next(train_generator)
            loss = model.train_on_batch(batch_spectra, batch_morgan)
            train_total_loss += loss
        train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0

        # Evaluation pass (no shuffling for reproducibility).
        test_generator = generate_batches(
            test_df, test_spectra, test_morgan,
            groups_per_batch=groups_per_batch, shuffle=False
        )
        test_total_loss = 0.0
        for _ in range(test_batch_count):
            batch_spectra, batch_morgan, _ = next(test_generator)
            loss = model.test_on_batch(batch_spectra, batch_morgan)
            test_total_loss += loss
        test_avg_loss = test_total_loss / test_batch_count if test_batch_count > 0 else 0.0

        print("计算测试集Top1准确度...")
        top1_acc = calculate_top1_accuracy(
            model, test_df, test_spectra, test_morgan,
            groups_per_batch=groups_per_batch, batch_size=32
        )

        print(f"训练损失: {train_avg_loss:.6f}")
        print(f"测试损失: {test_avg_loss:.6f}")
        print(f"测试集Top1准确度: {top1_acc:.6f}")
    return model


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description='训练质谱数据对比学习模型(CNN+Transformer)')
    parser.add_argument('--data', type=str, default='/home/admin123/code/骨架128.csv',
                        help='CSV数据文件路径')
    parser.add_argument('--epochs', type=int, default=5000, help='训练轮数')
    parser.add_argument('--lr', type=float, default=1e-4, help='初始学习率')
    parser.add_argument('--test_size', type=float, default=0.2, help='测试集比例')
    parser.add_argument('--groups_per_batch', type=int, default=8, choices=range(2, 65),
                        help='每个批次包含的SMILES组数(2到64之间)')
    parser.add_argument('--output', type=str, default='cnn_transformer_encoder.h5',
                        help='模型保存路径')
    args = parser.parse_args()

    model = train_model(
        csv_path=args.data,
        epochs=args.epochs,
        learning_rate=args.lr,
        test_size=args.test_size,
        groups_per_batch=args.groups_per_batch
    )
    model.save(args.output)
    print(f"CNN+Transformer模型已保存至: {args.output}")

# (Original page text, preserved:) 修改代码,解决报错Traceback (most recent call last):
# File "/home/admin123/code/encoder-decoder.py", line 482, in <module> model = train_model(
# File "/home/admin123/code/encoder-decoder.py", line 430, in train_model
# loss = model.train_on_batch(batch_spectra, batch_morgan)
# File ".../keras/src/backend/tensorflow/trainer.py", line 599,
in train_on_batch logs = self.train_function(data()) File "/home/admin123/anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 227, in function outputs = one_step_on_data(data) File "/home/admin123/anaconda3/envs/tensorflow/lib/python3.10/site-packages/tensorflow/python/util/traceback_utils.py", line 153, in error_handler raise e.with_traceback(filtered_tb) from None File "/home/admin123/anaconda3/envs/tensorflow/lib/python3.10/site-packages/tensorflow/python/eager/execute.py", line 53, in quick_execute tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name, tensorflow.python.framework.errors_impl.InvalidArgumentError: Input to reshape is a tensor with 1048576 values, but the requested shape has 1024 Stack trace for op definition: File "code/encoder-decoder.py", line 482, in <module> File "code/encoder-decoder.py", line 430, in train_model File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 599, in train_on_batch File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 227, in function File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 113, in one_step_on_data File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 60, in train_step File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/trainers/trainer.py", line 383, in _compute_loss File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/trainers/trainer.py", line 351, in compute_loss File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/trainers/compile_utils.py", line 690, in __call__ File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/trainers/compile_utils.py", line 699, in call File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/losses/loss.py", 
line 67, in __call__ File "anaconda3/envs/tensorflow/lib/python3.10/site-packages/keras/src/losses/losses.py", line 33, in call File "code/encoder-decoder.py", line 216, in nt_xent_loss [[{{node compile_loss/nt_xent_loss/Reshape}}]] tf2xla conversion failed while converting __inference_one_step_on_data_62493585[_XlaMustCompile=true,config_proto=8589078909834744431,executor_type=11160318154034397263]. Run with TF_DUMP_GRAPH_PREFIX=/path/to/dump/dir and --vmodule=xla_compiler=2 to obtain a dump of the compiled functions. [Op:__inference_one_step_on_data_62493585]
08-15
评论 1
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值