import datasets
import transformers
import modelscope
from itertools import chain
import glob
import torch
import torch.nn as nn
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score
from swanlab.integration.transformers import SwanLabCallback
import swanlab
from transformers import AutoModelForCausalLM, GPT2Config
from transformers import RobertaTokenizerFast
# Rotary positional embedding (RoPE) implementation
class RotaryPositionalEmbedding(nn.Module):
def __init__(self, dim, max_seq_len=2048, base=10000):
super().__init__()
self.dim = dim
self.max_seq_len = max_seq_len
self.base = base
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
self.register_buffer('inv_freq', inv_freq)
self.register_buffer('positions', torch.arange(max_seq_len))
def forward(self, x):
batch_size, seq_len, hidden_size = x.size()
seq_len = min(seq_len, self.max_seq_len)
positions = self.positions[:seq_len]
angles = positions.unsqueeze(1) * self.inv_freq.unsqueeze(0)
angles = angles.unsqueeze(0).repeat(batch_size, 1, 1)
cos_angles = torch.cos(angles)
sin_angles = torch.sin(angles)
x_reshaped = x.view(batch_size, seq_len, -1, 2)
x1, x2 = x_reshaped[..., 0], x_reshaped[..., 1]
rotated_x1 = x1 * cos_angles - x2 * sin_angles
rotated_x2 = x1 * sin_angles + x2 * cos_angles
rotated = torch.stack([rotated_x1, rotated_x2], dim=-1).view(batch_size, seq_len, hidden_size)
return rotated
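# A minimal shape sanity check for the module above (not part of the original training flow;
# the small dims are illustrative). With dim == hidden_size, the module pairs adjacent
# channels, rotates each pair by a position-dependent angle, and returns a tensor with the
# same [batch, seq_len, hidden_size] shape it received.
def _rotary_shape_check():
    demo = RotaryPositionalEmbedding(dim=8, max_seq_len=16)
    out = demo(torch.randn(2, 4, 8))
    assert out.shape == (2, 4, 8)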
# Custom model class, inheriting from the transformers PreTrainedModel
class CustomModelWithRotary(transformers.PreTrainedModel):
config_class = GPT2Config
def __init__(self, config):
super().__init__(config)
# The trust_remote_code argument has been removed (not needed for a local GPT-2 checkpoint)
self.base_model = transformers.AutoModelForCausalLM.from_pretrained(
"/root/workspace/workspace/gpt2-large",
config=config
)
self.rotary_emb = RotaryPositionalEmbedding(
dim=config.hidden_size,
max_seq_len=config.max_position_embeddings
)
self.config.position_encoding_type = "rotary"
def forward(self, input_ids=None, attention_mask=None, **kwargs):
outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
if hasattr(outputs, 'last_hidden_state'):
outputs.last_hidden_state = self.rotary_emb(outputs.last_hidden_state)
else:
# Handle the case where the model returns a plain tuple
outputs = (self.rotary_emb(outputs[0]),) + outputs[1:]
return outputs
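# Note: AutoModelForCausalLM resolves to GPT2LMHeadModel here, whose output exposes
# `loss`/`logits` but no `last_hidden_state`, so the branch above that rotates
# last_hidden_state is normally not taken. A hedged alternative (an assumption, not the
# original design) would be to request the hidden states explicitly and rotate the last layer:
#
#     outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask,
#                               output_hidden_states=True, **kwargs)
#     rotated = self.rotary_emb(outputs.hidden_states[-1])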
def main():
swanlab.init("PreTrain-GPT2-SELFIES")
swanlab_callback = SwanLabCallback(
project="PreTrain-GPT2-SELFIES",
experiment_name="PreTrain-GPT2-SELFIES"
)
raw_datasets = datasets.load_dataset(
"json", data_files="/root/workspace/selfies1.json"
)
# split dataset
raw_datasets = raw_datasets["train"].train_test_split(test_size=0.05, seed=2333)
print("dataset info")
print(raw_datasets)
saved_path = '/root/workspace/robertatokenizer'
tokenizer = RobertaTokenizerFast.from_pretrained(saved_path)
print("Loaded RobertaTokenizerFast from: " + saved_path)
# Set the tokenizer's maximum sequence length
tokenizer.model_max_length = 512
print(f"Set tokenizer model_max_length to: {512}")
context_length = 512  # use a small context length
vocab_size = tokenizer.vocab_size
print(f"Tokenizer vocab size: {vocab_size}")
# preprocess dataset
def tokenize(element):
# Preprocess the dataset: convert raw text into inputs the model can consume,
# using the RobertaTokenizerFast loaded above.
# truncation=True: truncate text that exceeds context_length
# max_length=context_length: cap each chunk at context_length tokens
# return_overflowing_tokens=True: also return the overflowing tokens as additional chunks
outputs = tokenizer(
element["text"],
truncation=True,
max_length=context_length,
return_overflowing_tokens=True,
return_length=True,
)
input_batch = []
# Keep only chunks that are exactly context_length tokens long, so every training example
# fills the full context window. outputs["input_ids"] holds the token chunks and
# outputs["length"] holds each chunk's length.
for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
if length == context_length:
input_batch.append(input_ids)
return {"input_ids": input_batch}
# map applies the tokenize function to every element of the dataset
# batched=True: process the examples in batches
# remove_columns=raw_datasets["train"].column_names: drop the original columns, keeping only input_ids
tokenized_datasets = raw_datasets.map(
tokenize, batched=True, num_proc=20, remove_columns=raw_datasets["train"].column_names
)
print("tokenize dataset info")
#
print(tokenized_datasets)
# eos_token marks the end of a text; reuse it as pad_token since GPT-2 has no dedicated padding token
tokenizer.pad_token = tokenizer.eos_token
# DataCollatorForLanguageModeling batches the tokenized examples and builds the labels
# mlm=False: causal language modeling (next-token prediction), not masked language modeling
data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
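# With mlm=False the collator prepares causal-LM batches: labels are a copy of input_ids,
# and any padding positions are set to -100 so they are ignored by the loss. Since only
# full 512-token chunks are kept above, padding should rarely be needed here.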
# Load the model configuration
config = GPT2Config.from_pretrained("/root/workspace/workspace/gpt2-large")
config.vocab_size = vocab_size  # update vocab_size in the config to match the new tokenizer
# Check the vocab_size in the config
print(f"Model config vocab size: {config.vocab_size}")
# Instantiate the custom model directly; no registration with AutoModel is needed
model = CustomModelWithRotary(config)
model_size = sum(t.numel() for t in model.parameters())
print("Model Config:")
print(config)
print(f"Model Size: {model_size/1000**2:.1f}M parameters")
# Load the required evaluation metrics
accuracy_metric = evaluate.load('./metrics/accuracy')
def compute_metrics(eval_preds):
logits, labels = eval_preds
# Take the argmax over the vocabulary dimension to get the predicted token ids
preds = np.argmax(logits, axis=-1)  # shape: [batch_size, sequence_length]
labels = labels[:, 1:].reshape(-1)
preds = preds[:, :-1].reshape(-1)
# Compute per-token accuracy
accuracy = accuracy_metric.compute(predictions=preds, references=labels)
return accuracy
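# Alignment note for the shift above: the logit at position t predicts token t+1, so
# preds[:, :-1] is compared against labels[:, 1:]. For example, with labels [a, b, c, d],
# the predictions at positions 0..2 are scored against [b, c, d].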
import random
def compute_metrics_partial(eval_preds, subset_ratio=0.5):
# eval_preds is assumed to be a tuple containing logits and labels
logits, labels = eval_preds
# Randomly select a subset of the batch for evaluation
batch_size = logits.shape[0]
subset_size = int(batch_size * subset_ratio)  # size of the subset
selected_indices = random.sample(range(batch_size), subset_size)
# Compute predictions only for the selected examples
selected_logits = logits[selected_indices]
selected_labels = labels[selected_indices]
preds = np.argmax(selected_logits, axis=-1)  # shape: [subset_size, sequence_length]
selected_labels = selected_labels[:, 1:].reshape(-1)
selected_preds = preds[:, :-1].reshape(-1)
# Compute accuracy
accuracy = accuracy_metric.compute(predictions=selected_preds, references=selected_labels)
return accuracy
# train
args = transformers.TrainingArguments(
output_dir="./GPT2-SELFIES",
per_device_train_batch_size=3,  # training batch size per GPU
per_device_eval_batch_size=3,  # evaluation batch size per GPU
eval_strategy="steps",
eval_steps=2,
logging_steps=5,
gradient_accumulation_steps=8,  # number of gradient accumulation steps
num_train_epochs=100,  # number of training epochs
lr_scheduler_type="cosine",  # learning-rate decay schedule
learning_rate=1e-5,  # base learning rate
save_steps=500,
save_total_limit=10,
bf16=True,  # enable bf16 training; on GPUs older than the Ampere architecture, use fp16=True instead
)
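# Effective batch size per optimizer step (per device): 3 x 8 gradient-accumulation steps
# = 24 sequences of 512 tokens. With eval_strategy="steps" and eval_steps=2, evaluation runs
# every 2 optimizer steps, which can dominate runtime on a large eval split.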
print("Train Args:")
print(args)
# enjoy training
trainer = transformers.Trainer(
model=model,
tokenizer=tokenizer,
args=args,
data_collator=data_collator,
train_dataset=tokenized_datasets["train"],
eval_dataset=tokenized_datasets["test"],
compute_metrics=compute_metrics,
callbacks=[swanlab_callback],
)
trainer.train()
# save model
trainer.save_model("./GPT2-SELFIES/Weight")  # directory where the final model is saved
# generate
pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)
print("GENERATE:", pipe("人工智能", num_return_sequences=1)[0]["generated_text"])
prompts = ["牛顿", "北京市", "亚洲历史"]
examples = []
for i in range(3):
# Generate text from the prompt
text = pipe(prompts[i], num_return_sequences=1)[0]["generated_text"]
text = swanlab.Text(text)
examples.append(text)
swanlab.log({"Generate": examples})
if __name__ == "__main__":
main()

Please help resolve the following error:

File "/root/workspace/gpt2-large.py", line 245, in <module>
main()
File "/root/workspace/gpt2-large.py", line 148, in main
model = CustomModelWithRotary(config)
File "/root/workspace/gpt2-large.py", line 51, in __init__
self.base_model = transformers.AutoModelForCausalLM.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
return model_class.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 262, in _wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4319, in from_pretrained
) = cls._load_pretrained_model(
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4955, in _load_pretrained_model
raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
Error(s) in loading state_dict for GPT2LMHeadModel:
size mismatch for wte.weight: copying a param with shape torch.Size([50257, 1280]) from checkpoint, the shape in current model is torch.Size([3289, 1280]).
You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

A completely new vocabulary is being used.
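A minimal sketch of two common ways to reconcile the checkpoint's 50257x1280 embedding with the new 3289-token vocabulary (assuming the goal is to keep the pretrained weights wherever shapes still match):

# Option 1: let from_pretrained skip the mismatched tensors; wte.weight (and the tied lm_head)
# are then freshly initialized at the new vocab_size taken from the config.
self.base_model = transformers.AutoModelForCausalLM.from_pretrained(
    "/root/workspace/workspace/gpt2-large",
    config=config,
    ignore_mismatched_sizes=True,
)

# Option 2: load the checkpoint with its original vocab_size, then resize the embeddings to
# the new tokenizer (in this case, do not overwrite config.vocab_size before loading).
base_model = transformers.AutoModelForCausalLM.from_pretrained("/root/workspace/workspace/gpt2-large")
base_model.resize_token_embeddings(vocab_size)  # vocab_size comes from the new tokenizer (3289)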