parquet转换为json数据格式:
这是源码(我自己运行时有点问题):《将大模型指令微调数据从parquet转为json格式》(parquet转json,CSDN博客)
下面是优化后的代码:在保存 JSON 文件之前,增加了一个步骤,
将所有 ndarray 对象转换为 JSON 可序列化的类型(例如 Python 的列表)。
import os
import json
import random
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import numpy as np # 导入numpy库以处理ndarray
def read_json_file(file_path):
    """Load a UTF-8 JSON file and return the decoded object.

    Failures are reported on stdout instead of being raised: a missing
    file, invalid JSON, or any other exception prints a message and the
    function returns ``None``.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
    except FileNotFoundError:
        print(f"File {file_path} not found.")
    except json.JSONDecodeError:
        print(f"File {file_path} is not a valid JSON file.")
    except Exception as e:
        print(f"An error occurred: {e}")
    else:
        return payload
    # Reached only after one of the handlers above has printed its message.
    return None
def read_jsonl_file(file_path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Every non-blank line is decoded with ``json.loads``; whitespace-only
    lines (for example a trailing empty line) are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON. The
            offending line is printed before the error propagates.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError:
                # Show the bad line, then let the real parse error surface
                # (with its position info) instead of masking it.
                print(line)
                raise
    return data
def read_praquet_file(file_path):
    """Load a Parquet file and return its rows as a list of lists.

    The file is read with pyarrow, converted to a pandas DataFrame, and
    each row is emitted as a list of its cell values in column order.
    (The function name is misspelled; it is kept as-is for callers.)
    """
    frame = pq.read_table(file_path).to_pandas()
    rows = []
    for _, series in frame.iterrows():
        rows.append(series.tolist())
    return rows
def save_json(file_path, data):
    """Write ``data`` to ``file_path`` as indented, UTF-8 encoded JSON.

    Non-ASCII characters are written verbatim rather than escaped, and a
    confirmation line is printed once the file has been written.
    """
    with open(file_path, 'w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=4)
    print(f'Save {file_path} is ok!')
def save_jsonl(file_path, data):
    """Write each item of ``data`` to ``file_path`` as one JSON line.

    Items are serialized without ASCII escaping, one per line. Any
    exception raised while writing is caught and reported on stdout
    rather than propagated.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + '\n'
                for record in data
            )
        print(f"Data saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the data: {e}")
def save_parquet(file_path, data):
    """Write ``data`` to ``file_path`` as a Parquet file.

    ``data`` may be a pandas DataFrame or a list, which is first wrapped
    in a DataFrame. A confirmation line is printed after writing.

    Raises:
        ValueError: if ``data`` is neither a list nor a DataFrame.
    """
    frame = pd.DataFrame(data) if isinstance(data, list) else data
    if not isinstance(frame, pd.DataFrame):
        raise ValueError("data must be a pandas DataFrame or a list of lists")
    table = pa.Table.from_pandas(frame)
    pq.write_table(table, file_path)
    print(f'Save {file_path} is ok!')
def convert_lists_to_json(df):
    """Replace list-valued cells of ``df`` with their JSON string form.

    Only columns holding at least one ``list`` are rewritten; cells in
    those columns that are not lists are left as they are. The DataFrame
    is modified in place and also returned.
    """
    def _encode(cell):
        if isinstance(cell, list):
            return json.dumps(cell)
        return cell

    for name in df.columns:
        holds_list = df[name].apply(lambda cell: isinstance(cell, list))
        if not holds_list.any():
            continue
        df[name] = df[name].apply(_encode)
    return df
# Helper: recursively turn NumPy values inside a nested structure into plain
# Python values so the result can be passed to json.dump.
def convert_ndarrays_to_lists(data):
    """Return ``data`` with NumPy arrays and scalars replaced by builtins.

    Lists and dicts are walked recursively (dict keys are left untouched).
    Arrays become (nested) lists; NumPy scalars become their Python
    equivalent via ``item()``. Any other value is returned unchanged.
    """
    if isinstance(data, np.ndarray):
        converted = data.tolist()
        if data.dtype == object:
            # tolist() leaves the elements of an object array as they are,
            # so nested arrays or dicts inside it still need converting.
            return convert_ndarrays_to_lists(converted)
        return converted
    if isinstance(data, np.generic):
        # NumPy scalars such as np.int64 / np.float32 are not accepted by
        # the json encoder; unwrap them to Python scalars.
        return data.item()
    if isinstance(data, list):
        return [convert_ndarrays_to_lists(item) for item in data]
    if isinstance(data, dict):
        return {key: convert_ndarrays_to_lists(value) for key, value in data.items()}
    return data
# Script: collect every .parquet file under `root`, turn each row into a
# two-turn conversation record, and write all records to one JSON file.
root = 'e://ultralytics-main/datasets/stack_java'
save_path = 'train285.json'
new_data = []
dirs = os.listdir(root)
print(dirs)
# os.listdir() returns entries in arbitrary order; iterate in sorted order so
# repeated runs emit the records in the same sequence.
for one in sorted(dirs):
    if not one.endswith('.parquet'):
        continue
    print(one)
    file_path = os.path.join(root, one)
    data = read_praquet_file(file_path)
    for x in data:
        # Rows are positional lists: column 3 is used as the user turn and
        # column 4 as the assistant turn.
        inp = x[3]
        res = x[4]
        # A null cell (None) has no len(); skip such rows instead of
        # aborting the whole conversion with a TypeError.
        if inp is None or res is None:
            continue
        # Drop rows where either side is empty.
        if len(inp) == 0 or len(res) == 0:
            continue
        new_entry = {
            "conversations": [
                {
                    "role": "user",
                    "content": inp
                },
                {
                    "role": "assistant",
                    "content": res
                }
            ]
        }
        new_data.append(new_entry)
# Before saving, convert every ndarray to a list so json can serialize it.
new_data = convert_ndarrays_to_lists(new_data)
save_json(save_path, new_data)