Setting the Encoding Format: Content Types and Workspace

This article explains how to set Eclipse's encoding to UTF-8 before importing a project, so that project files display correctly.


Before importing a project into Eclipse, set Eclipse's encoding format to "UTF-8". Both of the following places must be set:

1. Window > Preferences > General > Workspace: under "Text file encoding", select "Other" and choose UTF-8.
2. Window > Preferences > General > Content Types: select the "Text" type, enter UTF-8 in the "Default encoding" field, and click "Update".
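If you want the encoding to travel with the project itself (so anyone who imports it gets UTF-8 without touching Preferences), Eclipse also reads a per-project preferences file. A minimal sketch of that file is shown below; the file path and key reflect what Eclipse normally generates when you set the project encoding through the UI, so verify them against a project where the setting has already been applied:

```properties
# <project>/.settings/org.eclipse.core.resources.prefs
# The key "encoding/<project>" appears literally in files generated by Eclipse.
eclipse.preferences.version=1
encoding/<project>=UTF-8
```

Committing this file alongside the project keeps the UTF-8 setting consistent across workspaces.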
The following GPT-2 pretraining script (which loads a local gpt2-large checkpoint, a custom RobertaTokenizerFast vocabulary, and a rotary positional embedding wrapper) fails at model construction:

```python
import datasets
import transformers
import modelscope
import glob
import random
import torch
import torch.nn as nn
import numpy as np
import evaluate
import swanlab
from itertools import chain
from sklearn.metrics import accuracy_score
from swanlab.integration.transformers import SwanLabCallback
from transformers import AutoModelForCausalLM, GPT2Config, RobertaTokenizerFast


# Rotary positional embedding (RoPE)
class RotaryPositionalEmbedding(nn.Module):
    def __init__(self, dim, max_seq_len=2048, base=10000):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer('inv_freq', inv_freq)
        self.register_buffer('positions', torch.arange(max_seq_len))

    def forward(self, x):
        batch_size, seq_len, hidden_size = x.size()
        seq_len = min(seq_len, self.max_seq_len)
        positions = self.positions[:seq_len]
        angles = positions.unsqueeze(1) * self.inv_freq.unsqueeze(0)
        angles = angles.unsqueeze(0).repeat(batch_size, 1, 1)
        cos_angles = torch.cos(angles)
        sin_angles = torch.sin(angles)
        x_reshaped = x.view(batch_size, seq_len, -1, 2)
        x1, x2 = x_reshaped[..., 0], x_reshaped[..., 1]
        rotated_x1 = x1 * cos_angles - x2 * sin_angles
        rotated_x2 = x1 * sin_angles + x2 * cos_angles
        rotated = torch.stack([rotated_x1, rotated_x2], dim=-1).view(batch_size, seq_len, hidden_size)
        return rotated


# Custom model class that wraps the pretrained model and inherits from PreTrainedModel
class CustomModelWithRotary(transformers.PreTrainedModel):
    config_class = GPT2Config

    def __init__(self, config):
        super().__init__(config)
        # trust_remote_code is intentionally not passed here
        self.base_model = transformers.AutoModelForCausalLM.from_pretrained(
            "/root/workspace/workspace/gpt2-large",
            config=config
        )
        self.rotary_emb = RotaryPositionalEmbedding(
            dim=config.hidden_size,
            max_seq_len=config.max_position_embeddings
        )
        self.config.position_encoding_type = "rotary"

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
        if hasattr(outputs, 'last_hidden_state'):
            outputs.last_hidden_state = self.rotary_emb(outputs.last_hidden_state)
        else:
            # Handle the case where the model returns a plain tuple
            outputs = (self.rotary_emb(outputs[0]),) + outputs[1:]
        return outputs


def main():
    swanlab.init("PreTrain-GPT2-SELFIES")
    swanlab_callback = SwanLabCallback(
        project="PreTrain-GPT2-SELFIES",
        experiment_name="PreTrain-GPT2-SELFIES"
    )

    raw_datasets = datasets.load_dataset(
        "json", data_files="/root/workspace/selfies1.json"
    )
    # Split into train/test
    raw_datasets = raw_datasets["train"].train_test_split(test_size=0.05, seed=2333)
    print("dataset info")
    print(raw_datasets)

    saved_path = '/root/workspace/robertatokenizer'
    tokenizer = RobertaTokenizerFast.from_pretrained(saved_path)
    print("Loaded RobertaTokenizerFast from: " + saved_path)

    # Set the tokenizer's maximum length
    tokenizer.model_max_length = 512
    print(f"Set tokenizer model_max_length to: {512}")
    context_length = 512  # use a small context length
    vocab_size = tokenizer.vocab_size
    print(f"Tokenizer vocab size: {vocab_size}")

    # Preprocess the dataset: tokenize each text, truncating at context_length
    # and returning the overflowing chunks together with their lengths.
    def tokenize(element):
        outputs = tokenizer(
            element["text"],
            truncation=True,
            max_length=context_length,
            return_overflowing_tokens=True,
            return_length=True,
        )
        # Keep only chunks that are exactly context_length tokens long
        input_batch = []
        for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
            if length == context_length:
                input_batch.append(input_ids)
        return {"input_ids": input_batch}

    # Apply tokenize() to the dataset in batches and drop the original columns
    tokenized_datasets = raw_datasets.map(
        tokenize, batched=True, num_proc=20,
        remove_columns=raw_datasets["train"].column_names
    )
    print("tokenize dataset info")
    # print(tokenized_datasets)

    # Use the eos token as the padding token
    tokenizer.pad_token = tokenizer.eos_token
    # Collate batches for causal language modeling (mlm=False disables masked LM)
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

    # Load the GPT-2 config and overwrite vocab_size with the new tokenizer's size
    config = GPT2Config.from_pretrained("/root/workspace/workspace/gpt2-large")
    config.vocab_size = vocab_size
    print(f"Model config vocab size: {config.vocab_size}")

    # Instantiate the custom model directly; no registration needed
    model = CustomModelWithRotary(config)
    model_size = sum(t.numel() for t in model.parameters())
    print("Model Config:")
    print(config)
    print(f"Model Size: {model_size/1000**2:.1f}M parameters")

    # Load the required metrics
    accuracy_metric = evaluate.load('./metrics/accuracy')

    def compute_metrics(eval_preds):
        logits, labels = eval_preds
        # Predicted token = argmax over the vocabulary, shape [batch_size, sequence_length]
        preds = np.argmax(logits, axis=-1)
        # Shift so predictions are compared with the next-token labels
        labels = labels[:, 1:].reshape(-1)
        preds = preds[:, :-1].reshape(-1)
        # Per-token accuracy
        accuracy = accuracy_metric.compute(predictions=preds, references=labels)
        return accuracy

    def compute_metrics_partial(eval_preds, subset_ratio=0.5):
        # eval_preds is assumed to contain (logits, labels)
        logits, labels = eval_preds
        # Randomly select a subset of the batch to reduce evaluation cost
        batch_size = logits.shape[0]
        subset_size = int(batch_size * subset_ratio)
        selected_indices = random.sample(range(batch_size), subset_size)
        selected_logits = logits[selected_indices]
        selected_labels = labels[selected_indices]
        preds = np.argmax(selected_logits, axis=-1)  # shape: [subset_size, sequence_length]
        selected_labels = selected_labels[:, 1:].reshape(-1)
        selected_preds = preds[:, :-1].reshape(-1)
        accuracy = accuracy_metric.compute(predictions=selected_preds, references=selected_labels)
        return accuracy

    # Training arguments
    args = transformers.TrainingArguments(
        output_dir="./GPT2-SELFIES",
        per_device_train_batch_size=3,   # training batch size per GPU
        per_device_eval_batch_size=3,    # evaluation batch size per GPU
        eval_strategy="steps",
        eval_steps=2,
        logging_steps=5,
        gradient_accumulation_steps=8,   # gradient accumulation steps
        num_train_epochs=100,            # number of training epochs
        lr_scheduler_type="cosine",      # learning-rate decay schedule
        learning_rate=1e-5,              # base learning rate
        save_steps=500,
        save_total_limit=10,
        bf16=True,                       # bf16 training; use fp16=True on GPUs older than Ampere
    )
    print("Train Args:")
    print(args)

    # Train
    trainer = transformers.Trainer(
        model=model,
        tokenizer=tokenizer,
        args=args,
        data_collator=data_collator,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        callbacks=[swanlab_callback],
    )
    trainer.train()

    # Save the trained weights
    trainer.save_model("./GPT2-SELFIES/Weight")

    # Generate samples and log them to SwanLab
    pipe = transformers.pipeline("text-generation", model=model, tokenizer=tokenizer)
    print("GENERATE:", pipe("人工智能", num_return_sequences=1)[0]["generated_text"])
    prompts = ["牛顿", "北京市", "亚洲历史"]
    examples = []
    for i in range(3):
        # Generate text for each prompt
        text = pipe(prompts[i], num_return_sequences=1)[0]["generated_text"]
        text = swanlab.Text(text)
        examples.append(text)
    swanlab.log({"Generate": examples})


if __name__ == "__main__":
    main()
```

Running the script raises the following error, which needs to be resolved; the intent is to pretrain with a completely new vocabulary.

```
File "/root/workspace/gpt2-large.py", line 245, in <module>
    main()
File "/root/workspace/gpt2-large.py", line 148, in main
    model = CustomModelWithRotary(config)
File "/root/workspace/gpt2-large.py", line 51, in __init__
    self.base_model = transformers.AutoModelForCausalLM.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/models/auto/auto_factory.py", line 564, in from_pretrained
    return model_class.from_pretrained(
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 262, in _wrapper
    return func(*args, **kwargs)
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4319, in from_pretrained
    ) = cls._load_pretrained_model(
File "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py", line 4955, in _load_pretrained_model
    raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
Error(s) in loading state_dict for GPT2LMHeadModel:
    size mismatch for wte.weight: copying a param with shape torch.Size([50257, 1280]) from checkpoint, the shape in current model is torch.Size([3289, 1280]).
    You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.
```
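The mismatch happens because the gpt2-large checkpoint was saved with GPT-2's original 50,257-token embedding matrix, while `config.vocab_size` is overwritten with the new tokenizer's 3,289 tokens before loading, so `from_pretrained` cannot copy `wte.weight`. One way around this is to load the checkpoint with its own config and only then shrink the embeddings to the new vocabulary. The sketch below is a minimal illustration of that approach, not the asker's final code; the helper name `load_base_model_with_new_vocab` is invented here, and the same two lines could instead live inside `CustomModelWithRotary.__init__`:

```python
import transformers
from transformers import GPT2Config

def load_base_model_with_new_vocab(checkpoint_path, new_vocab_size):
    # Load the config exactly as shipped with the checkpoint (vocab_size=50257),
    # so the pretrained wte.weight can be restored without a shape mismatch.
    config = GPT2Config.from_pretrained(checkpoint_path)
    model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint_path, config=config)
    # Resize the input/output embeddings to the new tokenizer's vocabulary.
    # The resized matrix is freshly initialized for the new tokens and is
    # learned during pretraining; config.vocab_size is updated automatically.
    model.resize_token_embeddings(new_vocab_size)
    return model
```

Alternatively, keep `config.vocab_size = vocab_size` and pass `ignore_mismatched_sizes=True` to `from_pretrained`, as the error message itself suggests; the embedding matrix is then created at the new size and randomly initialized instead of being copied from the checkpoint. Since a brand-new SELFIES vocabulary does not map onto GPT-2's original tokens anyway, re-initializing the embeddings is expected with either approach.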