Input Data Source中的role

本文详细解释了数据科学流程中不同数据角色(如RAW、TRAIN、VALIDATE等)的作用及如何影响后续节点处理过程。特别强调了在防止过拟合方面的关键作用。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Help中的内容如下:
Choose the data Role from the drop-list. The role determines how the data set is used throughout the process flow. The role may be any of the following:

  • RAW - used as raw input to the node (default).
  • TRAIN - used to fit initial models.
  • VALIDATE - used by default for model assessment. The VALIDATE data set is also used for fine-tuning the model. Note: The Decision Tree and Neural Network nodes are capable of overfitting the TRAIN data set. To prevent these nodes from overfitting the TRAIN data set, the VALIDATE data set is automatically used to retreat to a simpler fit than the fit based on the TRAIN data alone. The VALIDATE data set can also be used by the Regression node for fine-tuning stepwise regression models.
  • TEST - additional "hold out" data set that you can use for model assessment.
  • SCORE - used to score a new data set that may not contain the target.
If you do not have training, validation, and test data sets, then you can create them with a successor Data Partition node.
*******************************************

但是这些role的用法究竟如何?如何影响后续的节点?

 

import json
from typing import Dict, List

import torch
import transformers
from lora_plus import LoraPlusTrainer
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import Trainer, TrainingArguments


def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> tuple[int, int]:
    """Split a token budget of ``cutoff_len`` between a source and a target.

    Args:
        source_len: Number of tokens in the (untruncated) source.
        target_len: Number of tokens in the (untruncated) target.
        cutoff_len: Total number of tokens available for source + target.

    Returns:
        ``(new_source_len, new_target_len)``; neither exceeds its input
        length and their sum never exceeds ``cutoff_len``.
    """
    if target_len * 2 < cutoff_len:
        # Target is short: it may keep everything, only the source is truncated.
        max_target_len = cutoff_len
    elif source_len * 2 < cutoff_len:
        # Source is short: it keeps everything, only the target is truncated.
        max_target_len = cutoff_len - source_len
    else:
        # Both are long: truncate both, proportionally to their lengths.
        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))

    new_target_len = min(max_target_len, target_len)
    max_source_len = max(cutoff_len - new_target_len, 0)
    new_source_len = min(max_source_len, source_len)
    return new_source_len, new_target_len


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Each record is a dict with a ``"conversations"`` list of
    ``{"role": "user" | "assistant", "content": str}`` messages. Records are
    tokenized lazily in ``__getitem__``.
    """

    def __init__(
        self,
        data_path,
        tokenizer,
        model_max_length,
        user_tokens=(128011,),
        assistant_tokens=(128012,),
    ):
        """Load the JSON file at ``data_path`` and print one decoded sample.

        Args:
            data_path: Path to a JSON file containing a list of records.
            tokenizer: Tokenizer providing ``encode``, ``decode``,
                ``bos_token_id`` and ``eos_token_id``.
            model_max_length: Token budget shared by all turns of one record.
            user_tokens: Token ids inserted before every user message.
            assistant_tokens: Token ids inserted before every assistant message.
        """
        super().__init__()
        # Context manager so the file handle is closed deterministically.
        with open(data_path, encoding="utf-8") as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.model_max_length = model_max_length
        # Copy into fresh lists: the defaults are immutable tuples, and the
        # ids are concatenated with other lists during preprocessing.
        self.user_tokens = list(user_tokens)
        self.assistant_tokens = list(assistant_tokens)
        self.ignore_index = -100

        # Sanity check: decode one preprocessed sample. Record 200 is used
        # when it exists; smaller datasets fall back to their last record
        # instead of raising IndexError, and empty datasets skip the check.
        if self.data:
            preview_idx = min(200, len(self.data) - 1)
            item = self.preprocessing(self.data[preview_idx])
            print("input:", self.tokenizer.decode(item["input_ids"]))

    def __len__(self):
        return len(self.data)

    def preprocessing(self, example):
        """Tokenize one record into ``input_ids``, ``labels``, ``attention_mask``.

        User/assistant messages are paired in order; an assistant message
        without a preceding unpaired user message is skipped. Source tokens
        are masked with ``ignore_index`` in the labels, except the leading
        BOS position.
        """
        input_ids = []
        labels = []

        # Pair each user message with the assistant message that follows it.
        messages = example["conversations"]
        pairs = []
        current_user_encoded = None

        for message in messages:
            if message["role"] == "user":
                # Encode the user message: BOS + user marker + content.
                current_user_encoded = (
                    [self.tokenizer.bos_token_id]
                    + self.user_tokens
                    + self.tokenizer.encode(message["content"], add_special_tokens=False)
                )
            elif message["role"] == "assistant" and current_user_encoded is not None:
                # Encode the assistant message: assistant marker + content.
                assistant_encoded = self.assistant_tokens + self.tokenizer.encode(
                    message["content"], add_special_tokens=False
                )
                # Store the pair as (source_ids, target_ids).
                pairs.append((current_user_encoded, assistant_encoded))
                current_user_encoded = None

        total_length = 0  # tokens consumed so far across all turns

        for turn_idx, (source_ids, target_ids) in enumerate(pairs):
            # Stop once the budget is exhausted.
            if total_length >= self.model_max_length:
                print("Exceeded max length, stopping processing further turns.")
                break

            # Truncate this turn to fit the remaining budget.
            source_len, target_len = infer_seqlen(
                len(source_ids), len(target_ids), self.model_max_length - total_length
            )
            source_ids = source_ids[:source_len]
            target_ids = target_ids[:target_len]

            total_length += source_len + target_len

            # Keep BOS as a label and mask the rest of the source. The final
            # slice keeps labels aligned with input_ids even if source_len
            # were ever 0.
            source_label = (
                [self.tokenizer.bos_token_id] + [self.ignore_index] * (source_len - 1)
            )[:source_len]
            target_label = target_ids

            input_ids += source_ids + target_ids
            labels += source_label + target_label

        # Append EOS.
        # NOTE(review): this can make the sequence model_max_length + 1 tokens
        # long when the turns fill the whole budget -- confirm this is intended.
        input_ids += [self.tokenizer.eos_token_id]
        labels += [self.tokenizer.eos_token_id]

        input_ids = torch.LongTensor(input_ids)
        labels = torch.LongTensor(labels)

        # No padding has been added at this point, so every position is a
        # real token. Comparing against pad_token_id would mask real tokens
        # whenever the pad id equals another id (e.g. EOS) and would raise if
        # the tokenizer has no pad token at all.
        attention_mask = torch.ones_like(input_ids)

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
        }

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        return self.preprocessing(self.data[idx])


class CustomTrainer(LoraPlusTrainer):
    """LoraPlusTrainer whose training dataloader is a plain shuffled DataLoader."""

    def get_train_dataloader(self) -> DataLoader:
        """Return a DataLoader over ``self.train_dataset`` with ``shuffle=True``.

        NOTE(review): this replaces the parent's dataloader construction
        entirely; confirm that any behaviour the parent adds there is not
        needed for this run.
        """
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            shuffle=True,
            collate_fn=self.data_collator,
            drop_last=False,
        )


# Guarded so that importing this module does not start a training run; the
# names below are still module-level globals when the file is run as a script.
if __name__ == "__main__":
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        '/data/coding/Weight',
        use_fast=False,
        trust_remote_code=True,
        model_max_length=1024,
    )

    train_dataset = SupervisedDataset(
        '/data/coding/transformers_from_scratch-main/Deepsmiles_SFT.json',
        tokenizer,
        model_max_length=1024,
    )

    data_collator = transformers.DataCollatorForSeq2Seq(tokenizer=tokenizer)

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "/data/coding/Weight", trust_remote_code=True, torch_dtype="auto"
    )

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        # Attention projections (q/k/v/o) and MLP projections (gate/up/down).
        target_modules=[
            "up_proj",
            "gate_proj",
            "o_proj",
            "q_proj",
            "v_proj",
            "down_proj",
            "k_proj",
        ],
        lora_dropout=0.0,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()  # prints the trainable-parameter share

    training_args = TrainingArguments(
        output_dir="./LLM_SFT1",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=5.0e-05,
        optim="adamw_torch",
        logging_steps=10,
        bf16=True,
        save_strategy="steps",
        lr_scheduler_type='cosine',
        max_grad_norm=1.0,
        save_steps=2000,
        warmup_steps=0,
    )

    # Train with the CustomTrainer defined above.
    lp_trainer = CustomTrainer(
        model,
        training_args,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    lp_trainer.train()
    lp_trainer.save_model(output_dir='./LLM_SFT1')

# ---------------------------------------------------------------------------
# Author's note from the original post (translated):
# The code above is my supervised fine-tuning script. One record of the
# training data looks like this:
#
# { "conversations": [ { "role": "user", "content": "Element:C C C C C C C C C C C C C C C C C C C C C C C C C H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H H O O O O O.Mass Error:-0.000694316.Precursor type:[M+H]+.Spectrum:[418.2719243, '129.0193:25.569620 143.0853:27.341772 145.1016:30.632911 157.1008:20.759494 159.1165:32.911392 169.1026:33.417722 173.1333:72.151899 199.1479:100.000000 225.163:46.835443']" }, { "role": "assistant", "content": "O=COCCCCCC=CC=CCC)CCOC=O)CC)C)CC)))))C6%10))))))))C)))))CCO)C6" } ] }
#
# I would like every decimal number inside the "Spectrum:[...]" field to be
# handled as one single token, rather than having its digits and the decimal
# point split into separate tokens.
# ---------------------------------------------------------------------------
最新发布
06-28
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值