Fixing the pd.read_json error ValueError: Trailing data

When reading JSON data with pandas' read_json in Python, you may hit ValueError: Trailing data. This usually means the file contains more than one JSON document, most often because it is a JSON Lines file. The fix is to pass lines=True to read_json.


When processing data in Python, we often need to read JSON data into a DataFrame, and pandas' read_json function makes this convenient. Sometimes, though, reading JSON data raises a very common error: ValueError: Trailing data. It means the parser found extra content after the first complete JSON document, typically because the file is in JSON Lines format (one JSON object per line) rather than a single JSON object or array.

The fix is straightforward: pass one extra argument to read_json. Specifically:

import pandas as pd

filename = "data.json"
with open(filename, 'r') as f:
    # lines=True tells read_json to treat the input as JSON Lines
    # (one JSON object per line), which avoids the Trailing data error.
    df = pd.read_json(f, lines=True)
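
To see why this works, here is a self-contained sketch; the file name data.json and its two-row contents are made up for illustration:

import pandas as pd

# A JSON Lines file: two newline-delimited objects. This is not a single
# valid JSON document, so the default parser stops after the first object
# and complains about the leftover bytes.
with open("data.json", "w") as f:
    f.write('{"a": 1, "b": 2}\n{"a": 3, "b": 4}\n')

# pd.read_json("data.json")                 # raises ValueError: Trailing data
df = pd.read_json("data.json", lines=True)  # parses one object per line
print(df)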
A reader comment (reproduced below, translated) reports the same underlying error surfacing through the Hugging Face datasets library, which falls back to pd.read_json when pyarrow cannot parse a JSON file:

# 3. Load the dataset from a JSONL file
dataset = load_dataset("json", data_files="train_data.jsonl")["train"]

# Tokenize the data
def tokenize_function(examples):
    prompts = []
    responses = []
    for msg_list in examples["message"]:
        # Concatenate the conversation history
        prompt = ""
        response = ""
        for msg in msg_list:
            if msg["role"] == "user":
                prompt += f"用户: {msg['content']}\n"  # accumulate user messages
            else:
                response = msg["content"]  # the last assistant message is the target output
        # Build the final prompt (user messages plus the assistant-reply prefix)
        prompts.append(prompt + "助手:")
        responses.append(response)
    # Tokenize inputs and labels
    model_inputs = tokenizer(prompts, max_length=512, truncation=True, padding=True)
    labels = tokenizer(responses, max_length=128, truncation=True, padding=True)["input_ids"]
    # Mask the prompt portion of the labels (loss is computed only on the assistant reply)
    model_inputs["labels"] = [
        [-100] * (len(input_ids) - len(label)) + label
        for input_ids, label in zip(model_inputs["input_ids"], labels)
    ]
    return model_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Running this produced the following output (traceback condensed to the essential chain):

Generating train split: 0 examples [00:00, ? examples/s]
Failed to load JSON from file 'F:\Programmer\python\MyAI\train_data.jsonl' with error <class 'pyarrow.lib.ArrowInvalid'>: JSON parse error: Invalid value. in row 14

File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:174, in Json._generate_tables(self, files)
--> 174 df = pandas_read_json(f)
ValueError: Trailing data

During handling of the above exception, another exception occurred:

File e:\Python311\python11\Lib\site-packages\datasets\packaged_modules\json\json.py:151, in Json._generate_tables(self, files)
--> 151 pa_table = paj.read_json(
ArrowInvalid: JSON parse error: Invalid value. in row 14

The above exception was the direct cause of the following exception:

DatasetGenerationError: An error occurred while generating the dataset

Both parsers fail on the same malformed line: pyarrow rejects row 14 of train_data.jsonl, datasets retries with pd.read_json, which raises ValueError: Trailing data, and datasets then re-raises the original error as a DatasetGenerationError.
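
Since the traceback names a specific row, a quick validation pass over the JSONL file is usually the fastest way to find and repair the offending line. Below is a minimal sketch along those lines; the input name train_data.jsonl comes from the comment above, while the cleaned-output file name and the report format are assumptions of mine:

import json

src = "train_data.jsonl"
dst = "train_data.clean.jsonl"  # hypothetical output file name

kept, dropped = 0, 0
with open(src, encoding="utf-8") as fin, open(dst, "w", encoding="utf-8") as fout:
    for lineno, line in enumerate(fin, start=1):
        stripped = line.strip()
        if not stripped:
            continue  # ignore blank lines
        try:
            json.loads(stripped)  # each line must be a standalone JSON object
        except json.JSONDecodeError as err:
            dropped += 1
            print(f"line {lineno}: {err}")  # report the malformed row
            continue
        fout.write(stripped + "\n")
        kept += 1

print(f"kept {kept} lines, dropped {dropped}")

After repairing or removing the reported lines, load_dataset("json", data_files="train_data.clean.jsonl") should generate the train split normally.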