# 3. Load the dataset from a JSONL file
from datasets import load_dataset  # `tokenizer` is assumed to be created in an earlier step

dataset = load_dataset("json", data_files="train_data.jsonl")["train"]  # load the JSONL file

# Tokenize the data
def tokenize_function(examples):
    prompts = []
    responses = []
    for msg_list in examples["message"]:
        # Concatenate the conversation history
        prompt = ""
        response = ""
        for msg in msg_list:
            if msg["role"] == "user":
                prompt += f"用户: {msg['content']}\n"  # accumulate user messages
            else:
                response = msg["content"]  # the last assistant message becomes the target output
        # Build the final prompt (user messages + assistant-reply prefix)
        prompts.append(prompt + "助手:")
        responses.append(response)
    # Tokenize inputs and labels
    model_inputs = tokenizer(prompts, max_length=512, truncation=True, padding=True)
    labels = tokenizer(responses, max_length=128, truncation=True, padding=True)["input_ids"]
    # Mask the input portion of the labels (loss is computed only on the assistant reply)
    model_inputs["labels"] = [
        [-100] * (len(input_ids) - len(label)) + label
        for input_ids, label in zip(model_inputs["input_ids"], labels)
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)
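For reference, tokenize_function above assumes that every line of train_data.jsonl is one self-contained JSON object with a "message" field holding a role/content list. A hypothetical well-formed line would look like this:

{"message": [{"role": "user", "content": "你好"}, {"role": "assistant", "content": "你好!有什么可以帮您?"}]}

Each record must sit on a single physical line; pretty-printed, multi-line JSON objects are exactly the kind of input that makes a JSONL loader fail, as happens below.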
Generating train split: 0 examples [00:00, ? examples/s]
Failed to load JSON from file 'F:\Programmer\python\MyAI\train_data.jsonl' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Invalid value. in row 14
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:174, in Json._generate_tables(self, files)
171 with open(
172 file, encoding=self.config.encoding, errors=self.config.encoding_errors
173 ) as f:
--> 174 df = pandas_read_json(f)
175 except ValueError:
File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:38, in pandas_read_json(path_or_buf, **kwargs)
37 kwargs["dtype_backend"] = "pyarrow"
---> 38 return pd.read_json(path_or_buf, **kwargs)
File e:\Python311\python11\Lib\site-packages\pandas\io\json\_json.py:815, in read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates, keep_default_dates, precise_float, date_unit, encoding, encoding_errors, lines, chunksize, compression, nrows, storage_options, dtype_backend, engine)
814 else:
--> 815 return json_reader.read()
File e:\Python311\python11\Lib\site-packages\pandas\io\json\_json.py:1014, in JsonReader.read(self)
1013 else:
-> 1014 obj = self._get_object_parser(self.data)
1015 if self.dtype_backend is not lib.no_default:
File e:\Python311\python11\Lib\site-packages\pandas\io\json\_json.py:1040, in JsonReader._get_object_parser(self, json)
1039 if typ == "frame":
-> 1040 obj = FrameParser(json, **kwargs).parse()
1042 if typ == "series" or obj is None:
File e:\Python311\python11\Lib\site-packages\pandas\io\json\_json.py:1176, in Parser.parse(self)
1174 @final
1175 def parse(self):
-> 1176 self._parse()
1178 if self.obj is None:
File e:\Python311\python11\Lib\site-packages\pandas\io\json\_json.py:1392, in FrameParser._parse(self)
1390 if orient == "columns":
1391 self.obj = DataFrame(
-> 1392 ujson_loads(json, precise_float=self.precise_float), dtype=None
1393 )
1394 elif orient == "split":
ValueError: Trailing data
During handling of the above exception, another exception occurred:
ArrowInvalid Traceback (most recent call last)
File e:\Python311\python11\Lib\site-packages\datasets\builder.py:1818, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1817 _time = time.time()
-> 1818 for _, table in generator:
1819 if max_shard_size is not None and writer._num_bytes > max_shard_size:
File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:177, in Json._generate_tables(self, files)
176 logger.error(f"Failed to load JSON from file '{file}' with error {type(e)}: {e}")
--> 177 raise e
178 if df.columns.tolist() == [0]:
File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:151, in Json._generate_tables(self, files)
150 try:
--> 151 pa_table = paj.read_json(
152 io.BytesIO(batch), read_options=paj.ReadOptions(block_size=block_size)
153 )
154 break
File e:\Python311\python11\Lib\site-packages\pyarrow\_json.pyx:342, in pyarrow._json.read_json()
File e:\Python311\python11\Lib\site-packages\pyarrow\error.pxi:155, in pyarrow.lib.pyarrow_internal_check_status()
File e:\Python311\python11\Lib\site-packages\pyarrow\error.pxi:92, in pyarrow.lib.check_status()
ArrowInvalid: JSON parse error: Invalid value. in row 14
The above exception was the direct cause of the following exception:
DatasetGenerationError Traceback (most recent call last)
Cell In[4], line 2
1 # 3. Load the dataset from a JSONL file
----> 2 dataset = load_dataset("json", data_files="train_data.jsonl")["train"] # load the JSONL file
4 # Tokenize the data
5 def tokenize_function(examples):
File e:\Python311\python11\Lib\site-packages\datasets\load.py:1417, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, verification_mode, keep_in_memory, save_infos, revision, token, streaming, num_proc, storage_options, **config_kwargs)
1414 return builder_instance.as_streaming_dataset(split=split)
1416 # Download and prepare data
-> 1417 builder_instance.download_and_prepare(
1418 download_config=download_config,
1419 download_mode=download_mode,
1420 verification_mode=verification_mode,
1421 num_proc=num_proc,
1422 storage_options=storage_options,
1423 )
1425 # Build dataset for splits
1426 keep_in_memory = (
1427 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1428 )
File e:\Python311\python11\Lib\site-packages\datasets\builder.py:897, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, verification_mode, dl_manager, base_path, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
895 if num_proc is not None:
896 prepare_split_kwargs["num_proc"] = num_proc
--> 897 self._download_and_prepare(
898 dl_manager=dl_manager,
899 verification_mode=verification_mode,
900 **prepare_split_kwargs,
901 **download_and_prepare_kwargs,
902 )
903 # Sync info
904 self.info.dataset_size = sum(split.num_bytes for split in self.info.splits.values())
File e:\Python311\python11\Lib\site-packages\datasets\builder.py:973, in DatasetBuilder._download_and_prepare(self, dl_manager, verification_mode, **prepare_split_kwargs)
969 split_dict.add(split_generator.split_info)
971 try:
972 # Prepare split will record examples associated to the split
--> 973 self._prepare_split(split_generator, **prepare_split_kwargs)
974 except OSError as e:
975 raise OSError(
976 "Cannot find data file. "
977 + (self.manual_download_instructions or "")
978 + "\nOriginal error:\n"
979 + str(e)
980 ) from None
File e:\Python311\python11\Lib\site-packages\datasets\builder.py:1705, in ArrowBasedBuilder._prepare_split(self, split_generator, file_format, num_proc, max_shard_size)
1703 job_id = 0
1704 with pbar:
-> 1705 for job_id, done, content in self._prepare_split_single(
1706 gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
1707 ):
1708 if done:
1709 result = content
File e:\Python311\python11\Lib\site-packages\datasets\builder.py:1861, in ArrowBasedBuilder._prepare_split_single(self, gen_kwargs, fpath, file_format, max_shard_size, job_id)
1859 if isinstance(e, DatasetGenerationError):
1860 raise
-> 1861 raise DatasetGenerationError("An error occurred while generating the dataset") from e
1863 yield job_id, True, (total_num_examples, total_num_bytes, writer._features, num_shards, shard_lengths)
DatasetGenerationError: An error occurred while generating the dataset
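The real failure is the ArrowInvalid buried in the middle of the traceback: row 14 of train_data.jsonl is not valid JSON, and the pandas fallback's "ValueError: Trailing data" points the same way (typical culprits are a record split across several lines, a trailing comma, an unquoted value, or stray text between records). load_dataset merely wraps that in a DatasetGenerationError. The fastest way to pin down the bad record is to parse the file line by line yourself; note that pyarrow's row index may not map one-to-one onto physical line numbers, so checking every line is safer. A minimal sketch, assuming the file is UTF-8 encoded:

import json

# Print every line of the JSONL file that fails to parse as standalone JSON.
with open("train_data.jsonl", encoding="utf-8") as f:
    for lineno, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # ignore blank lines
        try:
            json.loads(line)
        except json.JSONDecodeError as e:
            print(f"line {lineno}: {e}")
            print(f"  content: {line[:120]!r}")

Fix or delete the lines this reports, rerun the cell, and load_dataset should generate the train split normally.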