COCO Caption数据集转SFT格式(parquet格式转jpg和json)

# Transform to MLLM instruction-tuning data
import pyarrow.parquet as pq
import pandas as pd
import cv2
import numpy as np
import os
import json
import random
from tqdm import tqdm

# Prompt pool sampled for the "human" turn of each conversation.
# The plain prompt appears three times, so a uniform pick from this list
# selects it three times as often as any single other entry.
instructions = ["Give the caption of the image."] * 3 + [
    "Give the caption of the image in English.",
    "Describe the content of the image in detail, including all the objects and their interactions.",
    "Provide a comprehensive caption that summarizes the main elements and actions in the image.",
    "Explain what is happening in the image and the context surrounding the scene.",
    "Capture the essence of the image by describing the subject, setting, and any relevant actions.",
    "Create a caption that conveys the mood and atmosphere of the image.",
    "Describe the image with attention to detail, including colors, textures, and any unique features.",
    "Write a caption that highlights the most significant aspects of the image and makes them stand out.",
    "Provide a narrative caption that tells a story based on the image's content.",
    "Craft a caption that is both informative and engaging, drawing the viewer into the image.",
    "Describe the image in a way that captures the viewer's attention and encourages further exploration of the scene.",
    "Write a caption that is concise yet informative, giving just enough detail to paint a clear picture of the image.",
    "Provide a descriptive caption that helps the viewer understand the relationship between the objects in the image.",
    "Craft a caption that is not only accurate but also evokes emotion, reflecting the sentiment of the image.",
    "Write a caption that is suitable for an audience unfamiliar with the image's context, providing enough detail for understanding.",
]
# Folder holding the source Parquet files, and the folder that receives the
# decoded images (created on first run, reused afterwards).
data_dir = '/mnt/workspace/data/coco_captions/data'
image_dir = 'images'
os.makedirs(image_dir, exist_ok=True)

# Accumulators for the conversation records of each output split.
chat_test, chat_train = [], []

# Walk every file in the data folder, dump its images to disk and collect one
# conversation record per row into the list of the matching split.
def _build_conversation(row, filename):
    """Return one conversation record for *row*, whose image was saved as *filename*.

    The prompt is drawn at random from ``instructions`` and the ``<image>``
    placeholder is randomly placed either before or after it.
    """
    instruction = random.choice(instructions)

    if random.choice([True, False]):
        instruction = f"<image>\n{instruction}"
    else:
        instruction = f"{instruction}\n<image>"

    return {
        "id": row['cocoid'],
        "image": filename,
        "conversations": [
            {"from": "human", "value": instruction},
            {"from": "gpt", "value": row['caption']},
        ],
    }


def _convert_parquet(file_path, file_name, output_json):
    """Decode every image of one Parquet file and append its records to *output_json*.

    Args:
        file_path: Full path of the Parquet file to read.
        file_name: Bare file name, used only in diagnostic messages.
        output_json: List that receives one conversation dict per usable row.

    Rows whose image cannot be decoded or cannot be written to ``image_dir``
    are reported and skipped, so every appended record refers to an image
    that was actually saved.
    """
    data = pq.ParquetFile(file_path).read().to_pandas()

    # iterrows() is a generator; passing the length lets tqdm show real progress.
    for index, row in tqdm(data.iterrows(), total=len(data)):
        filename = row['filename']
        save_path = os.path.join(image_dir, filename)

        image_array = np.frombuffer(row['image']['bytes'], dtype=np.uint8)
        image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        if image is None:
            print(f"Failed to decode image for file {file_name}, row {index}.")
            continue

        # cv2.imwrite reports failure through its return value; without this
        # check a record would be emitted for an image that does not exist.
        if not cv2.imwrite(save_path, image):
            print(f"Failed to write image {save_path} for file {file_name}, row {index}.")
            continue

        output_json.append(_build_conversation(row, filename))


# Sorted so that the record order does not depend on the directory listing order.
for file_name in sorted(os.listdir(data_dir)):
    if file_name.startswith("test"):
        output_json = chat_test
    elif file_name.startswith(("train", "validation")):
        # Validation shards are folded into the training split.
        output_json = chat_train
    else:
        continue

    file_path = os.path.join(data_dir, file_name)
    print(file_path)
    _convert_parquet(file_path, file_name, output_json)

# Write each split's records to its own JSON file.
for out_name, records in (('chat_test.json', chat_test), ('chat_train.json', chat_train)):
    with open(out_name, 'w') as json_file:
        json.dump(records, json_file, indent=4)

print("Finished processing and saved chat data to chat_test.json and chat_train.json.")
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Yuezero_

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值