import numpy as np
import tensorflow as tf
import pandas as pd
import random
import argparse
import json
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from rdkit import Chem
from rdkit.Chem import AllChem
from tqdm import tqdm
import tensorflow.keras.backend as K
from tensorflow.keras import layers, Model
# Enable dynamic GPU memory growth
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(f"{len(gpus)} physical GPUs, {len(logical_gpus)} logical GPUs")
    except RuntimeError as e:
        print(e)
# Utility functions
def positional_encoding(max_position, d_model, min_freq=1e-6):
position = np.arange(max_position)
freqs = min_freq **(2 * (np.arange(d_model) // 2) / d_model)
pos_enc = position.reshape(-1, 1) * freqs.reshape(1, -1)
pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
return pos_enc.astype(np.float32)
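# Illustrative sanity check (hypothetical shapes, not values used elsewhere here):
#   enc = positional_encoding(max_position=8, d_model=4)
#   enc.shape -> (8, 4), enc.dtype -> float32
# Row p interleaves cos/sin of p scaled by per-dimension frequencies, giving each
# position a distinct, smoothly varying code.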
# Positional-encoding parameters
dimn = 64               # spectrum encoding dimension
cnn_feature_dim = 64    # CNN output feature dimension
transformer_dim = 64    # Transformer feature dimension
P = positional_encoding(256, transformer_dim, min_freq=1e2)
# Spectrum data augmentation: jitter m/z values and intensities with Gaussian noise
def augment_spectrum(mz_list, intensity_list, noise_factor=0.01):
noisy_mz = [mz + np.random.normal(0, noise_factor) for mz in mz_list]
noisy_intensity = [intensity * (1 + np.random.normal(0, noise_factor)) for intensity in intensity_list]
noisy_intensity = [max(0, i) for i in noisy_intensity]
return noisy_mz, noisy_intensity
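# Example with a hypothetical two-peak spectrum: the jitter is small relative to
# typical m/z values, and intensities are clipped at 0 so noise cannot flip signs.
#   mz, inten = augment_spectrum([100.05, 250.10], [0.80, 0.15], noise_factor=0.01)
#   len(mz) == 2 and all(i >= 0 for i in inten)  -> True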
def prepro_specs_train(df, augment=True):
df = df.reset_index(drop=True)
valid = []
mz_intensity = df['Spectrum'].to_list()
def process_line(line):
pairs = line.split()
mz_list = []
intensity_list = []
for pair in pairs:
mz, intensity = pair.split(':')
mz_list.append(float(mz))
intensity_list.append(float(intensity))
return mz_list, intensity_list
    for idx, line in tqdm(enumerate(mz_intensity), disable=False, desc="Preprocessing spectra"):
        mz_list, intensity_list = process_line(line)
        # Append the precursor mass as an extra m/z value (it has no paired intensity)
        mz_list.append(float(df.at[idx, 'Total Exact Mass']))
if augment:
mz_list, intensity_list = augment_spectrum(mz_list, intensity_list)
round_mz_list = [round(float(mz), 2) for mz in mz_list]
round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
valid.append([round_mz_list, round_intensity_list])
return tf.ragged.constant(valid)
def prepro_specs_test(df):
return prepro_specs_train(df, augment=False)
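# Expected 'Spectrum' cell format: whitespace-separated "mz:intensity" pairs,
# e.g. "100.05:0.80 250.10:0.15", which process_line() splits into parallel lists.
# Minimal sketch, assuming a DataFrame with just the columns read above:
#   df = pd.DataFrame({'Spectrum': ['100.05:0.80 250.10:0.15'], 'Total Exact Mass': [350.12]})
#   rag = prepro_specs_train(df, augment=False)   # ragged [mz_list, intensity_list] per row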
def encoding(rag_tensor, P, dimn):
to_pad = []
    for sample in tqdm(rag_tensor, desc="Encoding spectra"):
        mz_list = sample[0].numpy().tolist()  # m/z values (currently unused by the encoding)
        intensity_list = sample[1].numpy().tolist()
        # Map each intensity to a row of the positional-encoding table P
        positions = [min(int(round(intensity * 100)), len(P)-1) for intensity in intensity_list]
        pos_enc = np.array([P[pos] for pos in positions]) if positions else np.zeros((1, dimn))
        # Average the per-peak encodings into a single dimn-length vector
        averaged_encoding = [np.mean(pos_enc[:, dim]) for dim in range(dimn)]
to_pad.append(averaged_encoding)
return np.array(to_pad, dtype=np.float32)
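# Note: each spectrum vector is the mean of P rows indexed by int(round(intensity*100)),
# capped at len(P)-1 = 255, so intensities above ~2.55 all hit the last row.
# Usage sketch (reusing the module-level P and dimn):
#   vecs = encoding(prepro_specs_test(some_df), P, dimn)   # shape (len(some_df), dimn)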
# Convert a scaffold SMILES string to a Morgan fingerprint
def scaffold_to_morgan(smiles, radius=2, nBits=2048):
if pd.isna(smiles) or smiles == '':
return np.zeros(nBits, dtype=np.float32)
mol = Chem.MolFromSmiles(smiles)
if mol is None:
return np.zeros(nBits, dtype=np.float32)
fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
return np.array(fp, dtype=np.float32)
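# Example with benzene as a stand-in scaffold:
#   fp = scaffold_to_morgan('c1ccccc1')
#   fp.shape -> (2048,), a 0/1 float32 vector; invalid or empty SMILES yield all zeros.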
# Custom Transformer encoder layer
def transformer_encoder_layer(units, num_heads, dropout, name="transformer_encoder_layer"):
inputs = layers.Input(shape=(None, units), name="inputs")
attention = layers.MultiHeadAttention(
num_heads=num_heads, key_dim=units//num_heads, name="attention"
)(inputs, inputs)
attention = layers.Dropout(dropout)(attention)
attention = layers.Add()([inputs, attention])
attention = layers.LayerNormalization(epsilon=1e-6)(attention)
ffn = layers.Dense(units * 2, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(attention)
ffn = layers.Dense(units, activation="gelu", kernel_regularizer=tf.keras.regularizers.l2(1e-4))(ffn)
ffn = layers.Dropout(dropout)(ffn)
outputs = layers.Add()([attention, ffn])
outputs = layers.LayerNormalization(epsilon=1e-6)(outputs)
return Model(inputs=inputs, outputs=outputs, name=name)
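# Shape check (hypothetical sequence length): the layer is shape-preserving,
# mapping (batch, seq_len, units) -> (batch, seq_len, units).
#   enc_layer = transformer_encoder_layer(units=64, num_heads=2, dropout=0.3)
#   enc_layer(tf.zeros((1, 16, 64))).shape -> (1, 16, 64)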
# Build the CNN + Transformer encoder
def build_cnn_transformer_encoder(input_dim=dimn, cnn_filters=cnn_feature_dim,
transformer_dim=transformer_dim, num_layers=2,
num_heads=2, dropout=0.3):
inputs = layers.Input(shape=(input_dim,), name="input_layer")
x = layers.Reshape((input_dim, 1))(inputs)
x = layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout / 2)(x)  # half-rate dropout within the CNN stack
x = layers.Conv1D(filters=cnn_filters, kernel_size=5, padding='same', activation='gelu',
kernel_regularizer=tf.keras.regularizers.l2(1e-4))(x)
x = layers.BatchNormalization()(x)
    x = layers.Dropout(dropout / 2)(x)  # half-rate dropout within the CNN stack
if cnn_filters != transformer_dim:
x = layers.Conv1D(filters=transformer_dim, kernel_size=1, padding='same')(x)
pos_encoding = positional_encoding(input_dim, transformer_dim)
pos_encoding = tf.convert_to_tensor(pos_encoding, dtype=tf.float32)
pos_encoding = tf.expand_dims(pos_encoding, axis=0)
x = layers.Add()([x, pos_encoding])
for i in range(num_layers):
x = transformer_encoder_layer(
units=transformer_dim,
num_heads=num_heads,
dropout=dropout,
name=f"transformer_layer_{i}"
)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(2048, activation='relu')(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(2048, activation='relu', name="output_layer")(x)
return Model(inputs=inputs, outputs=outputs, name="cnn_transformer_encoder")
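# With the module defaults, the encoder maps a dimn-length spectrum vector to a
# 2048-d embedding, matching the Morgan fingerprint length used as the target:
#   enc = build_cnn_transformer_encoder()
#   enc(tf.zeros((1, dimn))).shape -> (1, 2048)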
# NT-Xent contrastive loss
def nt_xent_loss(y_true, y_pred, temperature=0.05):
    # L2-normalise both embeddings so dot products are cosine similarities
    encoder_output = tf.nn.l2_normalize(y_pred, axis=1)
    morgan_fp = tf.nn.l2_normalize(y_true, axis=1)
    samples_per_group = 256
    batch_size = tf.shape(encoder_output)[0]
    num_groups = batch_size // samples_per_group
    encoder_grouped = tf.reshape(encoder_output, (num_groups, samples_per_group, -1))
    morgan_grouped = tf.reshape(morgan_fp, (num_groups, samples_per_group, -1))
    # similarity_matrix[g, i, j] = cosine(spectrum i, fingerprint j) within group g
    similarity_matrix = tf.matmul(encoder_grouped, morgan_grouped, transpose_b=True)
    # generate_batches places the single Type == 'Pos' sample at index 0 of every
    # group, so [g, 0, 0] is the positive pair and [g, 0, 1:] are the negatives
    positive_similarity = similarity_matrix[:, 0, 0]
    negative_similarities = similarity_matrix[:, 0, 1:]
    numerator = tf.exp(positive_similarity / temperature)
    # Sum the negatives per group (axis=1), not across the whole batch
    denominator = tf.reduce_sum(tf.exp(negative_similarities / temperature), axis=1)
    per_group_loss = -tf.math.log(numerator / (numerator + denominator))
    return tf.reduce_mean(per_group_loss)
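# Minimal sketch on synthetic inputs (assumes the batch is built from whole
# [1 x Pos, 255 x Neg] groups, i.e. a multiple of samples_per_group = 256):
#   y_pred = tf.random.normal((256, 2048))   # one group of spectrum embeddings
#   y_true = tf.random.normal((256, 2048))   # matching fingerprints, Pos first
#   loss = nt_xent_loss(y_true, y_pred)      # scalar tensor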
# Batch generator: each group is ordered [1 x 'Pos', 255 x 'Neg'] so the positive is always at index 0
def generate_batches(df, encoded_spectra, morgan_fps, groups_per_batch=2, shuffle=True):
valid_smiles = []
smiles_groups = df.groupby('SMILES')
for smiles, group in smiles_groups:
pos_count = len(group[group['Type'] == 'Pos'])
neg_count = len(group[group['Type'] == 'Neg'])
if pos_count == 1 and neg_count == 255:
valid_smiles.append(smiles)
if shuffle:
random.shuffle(valid_smiles)
for i in range(0, len(valid_smiles), groups_per_batch):
batch_smiles = valid_smiles[i:i+groups_per_batch]
if not batch_smiles:
continue
all_spectra = []
all_morgan = []
all_df = []
for smiles in batch_smiles:
group = smiles_groups.get_group(smiles)
pos_samples = group[group['Type'] == 'Pos']
neg_samples = group[group['Type'] == 'Neg']
ordered_group = pd.concat([pos_samples, neg_samples])
group_indices = ordered_group.index.tolist()
all_spectra.append(encoded_spectra[group_indices])
all_morgan.append(morgan_fps[group_indices])
all_df.append(ordered_group)
batch_spectra = np.concatenate(all_spectra, axis=0)
batch_morgan = np.concatenate(all_morgan, axis=0)
batch_df = pd.concat(all_df, ignore_index=True)
yield batch_spectra, batch_morgan, batch_df
# Count the number of valid batches
def count_valid_batches(df, groups_per_batch=2):
valid_count = 0
for _, group in df.groupby('SMILES'):
pos_count = len(group[group['Type'] == 'Pos'])
neg_count = len(group[group['Type'] == 'Neg'])
if pos_count == 1 and neg_count == 255:
valid_count += 1
return (valid_count + groups_per_batch - 1) // groups_per_batch
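# Example: 5 valid SMILES groups with groups_per_batch=2 -> ceil(5/2) = 3 batches.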
# Top-1 accuracy
def calculate_top1_accuracy(model, test_df, test_spectra, test_morgan, groups_per_batch=2):
correct = 0
total = 0
test_generator = generate_batches(
test_df, test_spectra, test_morgan,
groups_per_batch=groups_per_batch, shuffle=False
)
test_batch_count = count_valid_batches(test_df, groups_per_batch=groups_per_batch)
    for _ in tqdm(range(test_batch_count), desc="Computing Top-1 accuracy"):
        batch_spectra, batch_morgan, _ = next(test_generator)
        # Run the encoder in inference mode and convert to NumPy for sklearn
        encoder_output = model(batch_spectra, training=False).numpy()
samples_per_group = 256
num_groups = len(batch_spectra) // samples_per_group
for group_idx in range(num_groups):
start_idx = group_idx * samples_per_group
end_idx = start_idx + samples_per_group
group_encoder = encoder_output[start_idx:end_idx]
group_morgan = batch_morgan[start_idx:end_idx]
similarities = cosine_similarity(group_encoder, group_morgan)
            # Row 0 is the 'Pos' spectrum; it must rank its own fingerprint highest
            pos_idx = 0
pos_similarities = similarities[pos_idx]
max_sim_idx = np.argmax(pos_similarities)
if max_sim_idx == pos_idx:
correct += 1
total += 1
if total == 0:
return 0.0
return correct / total
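# "Top-1" here means: within each 256-sample group, the 'Pos' spectrum's embedding
# is most similar to its own fingerprint among all 256 candidate fingerprints.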
# Data loading and preprocessing
def load_and_preprocess_data(csv_path, test_size=0.2, random_state=46):
    print(f"Loading data: {csv_path}")
df = pd.read_csv(csv_path)
valid_smiles = []
smiles_groups = df.groupby('SMILES')
for smiles, group in smiles_groups:
pos_count = len(group[group['Type'] == 'Pos'])
neg_count = len(group[group['Type'] == 'Neg'])
if pos_count == 1 and neg_count == 255:
valid_smiles.append(smiles)
print(f"有效SMILES组数: {len(valid_smiles)}")
# 先划分出独立测试集
train_val_smiles, test_smiles = train_test_split(
valid_smiles, test_size=test_size, random_state=random_state
)
train_val_df = df[df['SMILES'].isin(train_val_smiles)].reset_index(drop=True)
test_df = df[df['SMILES'].isin(test_smiles)].reset_index(drop=True)
print(f"训练验证集大小: {len(train_val_df)}, 独立测试集大小: {len(test_df)}")
print(f"训练验证SMILES数量: {len(train_val_smiles)}, 测试SMILES数量: {len(test_smiles)}")
# 处理测试集
print("预处理测试集光谱数据...")
test_rag_tensor = prepro_specs_test(test_df)
test_encoded_spectra = encoding(test_rag_tensor, P, dimn)
print("转换测试集Scaffold为摩根指纹...")
test_df['morgan_fp'] = test_df['Scaffold'].apply(scaffold_to_morgan)
test_morgan_fps = np.stack(test_df['morgan_fp'].values)
return train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps
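# The CSV is assumed to provide at least the columns read above or in the batch
# generators: 'SMILES', 'Type' ('Pos'/'Neg'), 'Spectrum', 'Total Exact Mass', 'Scaffold'.
# Only SMILES groups with exactly 1 'Pos' and 255 'Neg' rows (256 total) are kept.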
# Cross-validation training
def cross_validate_model(train_val_df, train_val_smiles, hyperparams, n_splits=10, epochs=10):
    """Evaluate one hyperparameter set with n_splits-fold cross-validation."""
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train_val_smiles)):
print(f"\n{'='*50}")
print(f"开始第 {fold+1}/{n_splits} 折交叉验证")
print(f"{'='*50}\n")
# 划分当前折的训练集和验证集
train_smiles = [train_val_smiles[i] for i in train_idx]
val_smiles = [train_val_smiles[i] for i in val_idx]
fold_train_df = train_val_df[train_val_df['SMILES'].isin(train_smiles)].reset_index(drop=True)
fold_val_df = train_val_df[train_val_df['SMILES'].isin(val_smiles)].reset_index(drop=True)
        # Preprocess spectra for this fold
        print("Preprocessing this fold's training spectra...")
train_rag_tensor = prepro_specs_train(fold_train_df, augment=True)
train_encoded_spectra = encoding(train_rag_tensor, P, dimn)
print("预处理当前折验证集光谱数据...")
val_rag_tensor = prepro_specs_test(fold_val_df)
val_encoded_spectra = encoding(val_rag_tensor, P, dimn)
        # Morgan fingerprints
        print("Converting this fold's training scaffolds to Morgan fingerprints...")
fold_train_df['morgan_fp'] = fold_train_df['Scaffold'].apply(scaffold_to_morgan)
train_morgan_fps = np.stack(fold_train_df['morgan_fp'].values)
print("转换当前折验证集Scaffold为摩根指纹...")
fold_val_df['morgan_fp'] = fold_val_df['Scaffold'].apply(scaffold_to_morgan)
val_morgan_fps = np.stack(fold_val_df['morgan_fp'].values)
        # Build the model
model = build_cnn_transformer_encoder(
input_dim=dimn,
cnn_filters=hyperparams['cnn_filters'],
transformer_dim=hyperparams['transformer_dim'],
num_layers=hyperparams['num_layers'],
num_heads=hyperparams['num_heads'],
dropout=hyperparams['dropout']
)
        # Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate'])
model.compile(optimizer=optimizer, loss=nt_xent_loss)
        # Compute batch counts
train_batch_count = count_valid_batches(fold_train_df, groups_per_batch=hyperparams['groups_per_batch'])
val_batch_count = count_valid_batches(fold_val_df, groups_per_batch=hyperparams['groups_per_batch'])
print(f"当前折训练批次数: {train_batch_count}, 验证批次数: {val_batch_count}")
# 训练模型
best_val_acc = 0.0
for epoch in range(epochs):
print(f"\n第 {epoch+1}/{epochs} 轮")
train_total_loss = 0.0
train_generator = generate_batches(
fold_train_df, train_encoded_spectra, train_morgan_fps,
groups_per_batch=hyperparams['groups_per_batch'], shuffle=True
)
            # Training
            for _ in tqdm(range(train_batch_count), desc="Training"):
batch_spectra, batch_morgan, _ = next(train_generator)
loss = model.train_on_batch(batch_spectra, batch_morgan)
train_total_loss += loss
train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0
            # Validation
val_generator = generate_batches(
fold_val_df, val_encoded_spectra, val_morgan_fps,
groups_per_batch=hyperparams['groups_per_batch'], shuffle=False
)
val_total_loss = 0.0
for _ in range(val_batch_count):
batch_spectra, batch_morgan, _ = next(val_generator)
loss = model.test_on_batch(batch_spectra, batch_morgan)
val_total_loss += loss
val_avg_loss = val_total_loss / val_batch_count if val_batch_count > 0 else 0.0
            # Validation accuracy
val_acc = calculate_top1_accuracy(
model, fold_val_df, val_encoded_spectra, val_morgan_fps,
groups_per_batch=hyperparams['groups_per_batch']
)
print(f"训练损失: {train_avg_loss:.6f}, 验证损失: {val_avg_loss:.6f}, 验证准确度: {val_acc:.6f}")
# 保存当前折中最佳模型
if val_acc > best_val_acc:
best_val_acc = val_acc
fold_results.append(best_val_acc)
print(f"\n第 {fold+1} 折最佳验证准确度: {best_val_acc:.6f}")
# 清除内存
K.clear_session()
# 计算交叉验证结果
mean_acc = np.mean(fold_results)
std_acc = np.std(fold_results)
print(f"\n{'='*50}")
print(f"十折交叉验证结果: {mean_acc:.6f} ± {std_acc:.6f}")
print(f"{'='*50}\n")
return mean_acc, std_acc, fold_results
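# Example call (hypothetical hyperparameters; main() supplies the real grid):
#   mean_acc, std_acc, folds = cross_validate_model(
#       train_val_df, train_val_smiles,
#       {'cnn_filters': 64, 'transformer_dim': 64, 'num_layers': 1, 'num_heads': 2,
#        'dropout': 0.3, 'learning_rate': 1e-4, 'groups_per_batch': 8},
#       n_splits=10, epochs=10)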
# Train the final model on the full training set
def train_final_model(train_val_df, hyperparams, epochs=10):
    print("\nTraining the final model on the full training set...")
    # Preprocess all training data
    print("Preprocessing full-training-set spectra...")
train_rag_tensor = prepro_specs_train(train_val_df, augment=True)
train_encoded_spectra = encoding(train_rag_tensor, P, dimn)
print("转换完整训练集Scaffold为摩根指纹...")
train_val_df['morgan_fp'] = train_val_df['Scaffold'].apply(scaffold_to_morgan)
train_morgan_fps = np.stack(train_val_df['morgan_fp'].values)
    # Build the model
model = build_cnn_transformer_encoder(
input_dim=dimn,
cnn_filters=hyperparams['cnn_filters'],
transformer_dim=hyperparams['transformer_dim'],
num_layers=hyperparams['num_layers'],
num_heads=hyperparams['num_heads'],
dropout=hyperparams['dropout']
)
    # Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=hyperparams['learning_rate'])
model.compile(optimizer=optimizer, loss=nt_xent_loss)
    # Compute batch count
train_batch_count = count_valid_batches(train_val_df, groups_per_batch=hyperparams['groups_per_batch'])
print(f"完整训练集批次数: {train_batch_count}")
# 训练模型
for epoch in range(epochs):
print(f"\n第 {epoch+1}/{epochs} 轮")
train_total_loss = 0.0
train_generator = generate_batches(
train_val_df, train_encoded_spectra, train_morgan_fps,
groups_per_batch=hyperparams['groups_per_batch'], shuffle=True
)
        # Training
        for _ in tqdm(range(train_batch_count), desc="Training"):
batch_spectra, batch_morgan, _ = next(train_generator)
loss = model.train_on_batch(batch_spectra, batch_morgan)
train_total_loss += loss
train_avg_loss = train_total_loss / train_batch_count if train_batch_count > 0 else 0.0
print(f"训练损失: {train_avg_loss:.6f}")
return model
# Main entry point
def main(args):
    # Load the data and split into train/validation and an independent test set
train_val_df, train_val_smiles, test_df, test_encoded_spectra, test_morgan_fps = load_and_preprocess_data(
args.data, test_size=args.test_size
)
    # Hyperparameter search space: the full Cartesian product of the varied
    # settings (num_layers, num_heads, dropout, learning_rate), written as a
    # comprehension instead of 16 hand-written dicts
    param_grid = [
        {
            'cnn_filters': 64,
            'transformer_dim': 64,
            'num_layers': num_layers,
            'num_heads': num_heads,
            'dropout': dropout,
            'learning_rate': learning_rate,
            'groups_per_batch': args.groups_per_batch
        }
        for num_layers in (1, 2)
        for num_heads in (2, 4)
        for dropout in (0.3, 0.4)
        for learning_rate in (1e-4, 5e-5)
    ]
    # Hyperparameter optimisation
best_score = -1
best_params = None
results = []
print("\n开始超参数搜索和十折交叉验证...")
for i, params in enumerate(param_grid):
print(f"\n{'#'*50}")
print(f"测试第 {i+1}/{len(param_grid)} 组超参数: {params}")
print(f"{'#'*50}\n")
mean_acc, std_acc, fold_results = cross_validate_model(
train_val_df, train_val_smiles,
params,
n_splits=10,
epochs=args.epochs
)
results.append({
'params': params,
'mean_acc': mean_acc,
'std_acc': std_acc,
'fold_results': fold_results
})
if mean_acc > best_score:
best_score = mean_acc
best_params = params
    # Report the best hyperparameters
    print("\n" + "="*70)
    print("Hyperparameter search complete!")
    print(f"Best hyperparameters: {best_params}")
    print(f"Best cross-validation accuracy: {best_score:.6f}")
    print("="*70 + "\n")
    # Train the final model with the best hyperparameters
final_model = train_final_model(
train_val_df,
best_params,
epochs=args.epochs
)
    # Evaluate on the independent test set
    print("\nEvaluating the final model on the independent test set...")
test_acc = calculate_top1_accuracy(
final_model, test_df, test_encoded_spectra, test_morgan_fps,
groups_per_batch=best_params['groups_per_batch']
)
print(f"\n{'='*70}")
print(f"独立测试集Top1准确度: {test_acc:.6f}")
print("="*70 + "\n")
    # Save the model
    final_model.save(args.output)
    print(f"Final model saved to: {args.output}")
    # Save the hyperparameter search results (json is imported at the top);
    # fall back to float() for any remaining non-serialisable scalars
    with open('hyperparameter_results.json', 'w') as f:
        json.dump(results, f, indent=2, default=lambda x: x.tolist() if isinstance(x, np.ndarray) else float(x))
    print("Hyperparameter search results saved to hyperparameter_results.json")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Contrastive learning on mass-spectral data (CNN + Transformer) with 10-fold cross-validation')
    parser.add_argument('--data', type=str, default='/home/admin123/code/骨架256.csv', help='Path to the CSV data file')
    parser.add_argument('--epochs', type=int, default=1000, help='Number of training epochs')
    parser.add_argument('--test_size', type=float, default=0.2, help='Fraction held out as the independent test set')
    parser.add_argument('--groups_per_batch', type=int, default=8,
                        choices=range(1, 65),
                        help='Number of SMILES groups per batch (1 to 64)')
    parser.add_argument('--output', type=str, default='best_cnn_transformer_encoder.h5', help='Path for saving the final model')
    args = parser.parse_args()
    main(args)