import numpy as np
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import torch
import re
from sklearn.model_selection import train_test_split
CHEM_FORMULA_SIZE = "([A-Z][a-z]*)([0-9]*)"
VALID_ELEMENTS = [
"C",
"N",
"P",
"O",
"S",
"Si",
"I",
"H",
"Cl",
"F",
"Br",
"B",
"Se",
"Fe",
"Co",
"As",
"K",
"Na",
]
ELEMENT_VECTORS = np.eye(len(VALID_ELEMENTS))
element_to_position = dict(zip(VALID_ELEMENTS, ELEMENT_VECTORS))
def formula_to_dense(chem_formula: str) -> np.ndarray:
total_onehot = []
for (chem_symbol, num) in re.findall(CHEM_FORMULA_SIZE, chem_formula):
num = 1 if num == "" else int(num)
one_hot = element_to_position[chem_symbol].reshape(1, -1)
one_hot_repeats = np.repeat(one_hot, repeats=num, axis=0)
total_onehot.append(one_hot_repeats)
if len(total_onehot) == 0:
dense_vec = np.zeros(len(element_to_position))
else:
dense_vec = np.vstack(total_onehot).sum(0)
return dense_vec
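# Quick sanity check (the formula below is an arbitrary example): formula_to_dense
# returns an element-count vector with one entry per element in VALID_ELEMENTS.
_counts = formula_to_dense("C6H12O6")
assert _counts.shape == (len(VALID_ELEMENTS),)   # (18,)
assert _counts[VALID_ELEMENTS.index("H")] == 12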
def sine_embed(v, max_count=256):
num_freqs = int(np.ceil(np.log2(max_count)))
freqs = 0.5 ** torch.arange(num_freqs, dtype=torch.float32) * np.pi
v_tensor = torch.tensor(v, dtype=torch.float32)[:, None]
embedded = torch.sin(v_tensor * freqs[None, :])
return torch.abs(embedded).numpy()
def encode_formula(formula: str):
    candidate_features = formula_to_dense(formula)  # convert a single chemical formula into an element-count feature vector
sine_embeddings = sine_embed(candidate_features)
return sine_embeddings.flatten()
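# The flattened sinusoidal embedding has length len(VALID_ELEMENTS) * ceil(log2(256))
# = 18 * 8 = 144 (same arbitrary example formula as above).
_embedding = encode_formula("C6H12O6")
assert _embedding.shape == (144,)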
def positional_encoding(max_position, d_model, min_freq=1e-6):
position = np.arange(max_position)
freqs = min_freq**(2*(np.arange(d_model)//2)/d_model)
pos_enc = position.reshape(-1,1)*freqs.reshape(1,-1)
pos_enc[:, ::2] = np.cos(pos_enc[:, ::2])
pos_enc[:, 1::2] = np.sin(pos_enc[:, 1::2])
return pos_enc
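# Shape check: positional_encoding returns a (max_position, d_model) matrix with
# cosines in the even columns and sines in the odd columns.
_pe = positional_encoding(10, 8)
assert _pe.shape == (10, 8)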
P = positional_encoding(2000000, 256, min_freq=1e2)
dimn = 256
def encoding(rag_tensor,P,dimn):
to_pad=[]
for sample in rag_tensor:
all_dim=[sample[0].numpy().tolist()]
pos_enc=[P[int(i)-1] for i in sample[1].numpy().tolist()]
for dim in range(dimn):
dim_n=[i[dim] for i in pos_enc]
all_dim.append(dim_n)
to_pad.append(all_dim)
to_pad=[tf.keras.preprocessing.sequence.pad_sequences(i,maxlen=501,dtype='float32',padding='post',truncating='post',value=10) for i in to_pad]
to_pad=np.stack((to_pad))
to_pad=np.swapaxes(to_pad, 1, -1)
return to_pad
def trun_n_d(n, d):
    # Truncate (without rounding) the decimal string n to d digits after the decimal point
    return (n if not n.find('.') + 1 else n[:n.find('.') + d + 1])
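# Example: truncating (not rounding) a decimal string to two decimal places.
assert trun_n_d("3.14159", 2) == "3.14"
assert trun_n_d("42", 2) == "42"   # strings without a '.' pass through unchanged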
def prepro_specs_train(df):
df = df.reset_index(drop=True)
valid = []
mz_intensity = df['Spectrum'].to_list()
def process_line(line):
pairs = line.split()
mz_list = []
intensity_list = []
for pair in pairs:
mz, intensity = pair.split(':')
mz_list.append(float(mz))
intensity_list.append(float(intensity))
return mz_list, intensity_list
    for idx, spectrum_line in enumerate(tqdm(mz_intensity)):
        mz_list, intensity_list = process_line(spectrum_line)
mz_list.append(float(df.at[idx, 'Total Exact Mass']))
round_mz_list = [round(float(mz), 2) for mz in mz_list]
round_intensity_list = [round(float(intensity), 2) for intensity in intensity_list]
valid.append([round_mz_list, round_intensity_list])
return tf.ragged.constant(valid)
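# Usage sketch (hedged): assumes a DataFrame whose 'Spectrum' column holds
# space-separated "mz:intensity" strings and whose 'Total Exact Mass' column is
# numeric; the file name below is a placeholder.
# df = pd.read_csv("train.csv")
# rag = prepro_specs_train(df)         # ragged [mz_list, intensity_list] per spectrum
# spec_input = encoding(rag, P, dimn)  # -> (batch, 501, 257)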
import json
import torch
from typing import Dict, List
from torch.utils.data import Dataset
import transformers
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, SequentialSampler
from transformers import Trainer, TrainingArguments
from lora_plus import LoraPlusTrainer
from torch.utils.data import RandomSampler
def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
if target_len * 2 < cutoff_len: # truncate source
max_target_len = cutoff_len
elif source_len * 2 < cutoff_len: # truncate target
max_target_len = cutoff_len - source_len
else: # truncate both
max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
    new_target_len = min(max_target_len, target_len)
max_source_len = max(cutoff_len - new_target_len, 0)
new_source_len = min(max_source_len, source_len)
return new_source_len, new_target_len
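# Worked example: with cutoff_len=120 and (source_len=100, target_len=50), the
# "truncate source" branch applies (50 * 2 < 120), so the target keeps all 50
# tokens and the source is cut to the remaining 70.
assert infer_seqlen(100, 50, 120) == (70, 50)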
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path,
tokenizer,
model_max_length,
user_tokens=[151644],
assistant_tokens=[151645],
):
super(SupervisedDataset, self).__init__()
self.data = json.load(open(data_path))
self.tokenizer = tokenizer
self.model_max_length = model_max_length
self.user_tokens = user_tokens
self.assistant_tokens = assistant_tokens
self.ignore_index = -100
        # Sanity-check that the first record is processed correctly
        item = self.preprocessing(self.data[0])
        print("input:", self.tokenizer.decode(item["input_ids"]))
        labels = [id_ for id_ in item["labels"] if id_ != -100]  # drop ignored (-100) positions
        print("label:", self.tokenizer.decode(labels))
def __len__(self):
return len(self.data)
def preprocessing(self, example):
input_ids = []
labels = []
        # Pair up the user and assistant messages
messages = example["conversations"]
pairs = []
current_user_encoded = None
        # Encode each user/assistant message and pack them into (source, target) pairs
for message in messages:
if message["role"] == "user":
                # Encode the user message
current_user_encoded = [self.tokenizer.bos_token_id] + self.user_tokens + self.tokenizer.encode(
message["content"], add_special_tokens=False
)
elif message["role"] == "assistant" and current_user_encoded is not None:
                # Encode the assistant message
assistant_encoded = self.assistant_tokens + self.tokenizer.encode(
message["content"], add_special_tokens=False
)
                # Form one (source_ids, target_ids) pair
pairs.append((current_user_encoded, assistant_encoded))
current_user_encoded = None
        total_length = 0  # running token count across turns
        # Process the encoded (source_ids, target_ids) pairs turn by turn
for turn_idx, (source_ids, target_ids) in enumerate(pairs):
            # Stop once the maximum length has been reached
if total_length >= self.model_max_length:
print("Exceeded max length, stopping processing further turns.")
break
            # Dynamically truncate source/target to fit the remaining length budget
source_len, target_len = infer_seqlen(
len(source_ids), len(target_ids), self.model_max_length - total_length
)
source_ids = source_ids[:source_len]
target_ids = target_ids[:target_len]
            # Update the running length
total_length += source_len + target_len
source_label = [self.tokenizer.bos_token_id] + [self.ignore_index] * (source_len-1)
target_label = target_ids
            # Concatenate this turn onto the running sequences
input_ids += source_ids + target_ids
labels += source_label + target_label
        # Append the EOS token
input_ids += [self.tokenizer.eos_token_id]
labels += [self.tokenizer.eos_token_id]
input_ids += [self.tokenizer.pad_token_id] * (
self.model_max_length - len(input_ids)
)
labels += [self.ignore_index] * (self.model_max_length - len(labels))
        # Convert to tensors
input_ids = torch.LongTensor(input_ids)
labels = torch.LongTensor(labels)
        # Build the attention mask (non-pad positions)
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
return {
"input_ids": input_ids,
"labels": labels,
"attention_mask": attention_mask,
}
def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        return self.preprocessing(self.data[idx])

Modify the code so that it reads a CSV file and extracts the Molecular Formula, Total Exact Mass, Spectrum, and SELFIES columns. Apply formula_to_dense to the Molecular Formula column to obtain an input of shape (batch, 18); apply prepro_specs_train and encoding to the Total Exact Mass and Spectrum columns to obtain an input of shape (batch, 501, 257); then encode the SELFIES column with the tokenizer, using cls_token and sep_token as the start and end markers and pad_token for padding.
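A minimal sketch of what that CSV-based preprocessing could look like (not the original author's code; the function name, file name, and maximum SELFIES length below are placeholder assumptions, and the tokenizer is assumed to be a Hugging Face tokenizer with cls_token, sep_token, and pad_token set):

def load_csv_inputs(csv_path, tokenizer, max_selfies_len=512):
    # The CSV is assumed to contain "Molecular Formula", "Total Exact Mass",
    # "Spectrum" and "SELFIES" columns.
    df = pd.read_csv(csv_path)

    # (batch, 18): element-count vector for each molecular formula
    formula_input = np.stack([formula_to_dense(f) for f in df["Molecular Formula"]])

    # (batch, 501, 257): spectrum plus total exact mass, positionally encoded
    rag = prepro_specs_train(df)
    spectrum_input = encoding(rag, P, dimn)

    # SELFIES tokenized with cls/sep as the start/end markers and pad for padding
    input_ids, attention_masks = [], []
    for selfies in df["SELFIES"]:
        ids = [tokenizer.cls_token_id] + tokenizer.encode(selfies, add_special_tokens=False)
        ids = ids[: max_selfies_len - 1] + [tokenizer.sep_token_id]
        pad_len = max_selfies_len - len(ids)
        attention_masks.append([1] * len(ids) + [0] * pad_len)
        input_ids.append(ids + [tokenizer.pad_token_id] * pad_len)

    selfies_ids = torch.LongTensor(input_ids)
    selfies_mask = torch.LongTensor(attention_masks)
    return formula_input, spectrum_input, selfies_ids, selfies_mask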