[Paper Notes] LLM Dataset: LongData-Corpus

https://huggingface.co/datasets/yuyijiong/LongData-Corpus

1. Data on Hugging Face

On the dev machine, generate an SSH key, then cat the public key, copy it, and add it to your Hugging Face account settings.
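
With the key registered you can clone the dataset repo over SSH with git; alternatively, here is a minimal sketch using the huggingface_hub Python client (the local_dir path is just a placeholder):

from huggingface_hub import snapshot_download

# Download the whole dataset repo; allow_patterns can restrict this to a subset of files
snapshot_download(
    repo_id="yuyijiong/LongData-Corpus",
    repo_type="dataset",
    local_dir="./LongData-Corpus",  # placeholder path, change as needed
)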

2. The Chinese novel data is on a cloud drive

Tsinghua University Cloud

Download:

#!/bin/bash

# Base URL
base_url="https://cloud.tsinghua.edu.cn/d/0670fcb14d294c97b5cf/files/?p=%2F%E4%B8%AD%E6%96%87%E5%B0%8F%E8%AF%B4_"

# Loop to download files from 1 to 85
for i in {1..85}
do
  # Plain integer index (printf "%d" does not zero-pad; the file names use un-padded numbers, matching jieya.py below)
  num=$(printf "%d" $i)

  # Construct the URL
  url="${base_url}${num}.jsonl.zst&dl=1"

  # Download the file
  wget -O "中文小说_${num}.jsonl.zst" "$url"

  # Check if download was successful
  if [ $? -eq 0 ]; then
    echo "Downloaded: 中文小说_${num}.jsonl.zst"
  else
    echo "Failed to download: 中文小说_${num}.jsonl.zst"
  fi
done
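
Cloud share links occasionally hand back an HTML error page, which wget still saves under the .zst name. A quick sanity check is to look at the first four bytes of each file for the zstd frame magic (28 B5 2F FD); a minimal Python sketch, assuming the files sit in the current directory:

import os

ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"  # zstd frame magic number (0xFD2FB528, little-endian)

for i in range(1, 86):
    name = f"中文小说_{i}.jsonl.zst"
    if not os.path.exists(name):
        print(f"missing: {name}")
        continue
    with open(name, "rb") as f:
        head = f.read(4)
    if head != ZSTD_MAGIC:
        # Likely an HTML error page or a truncated download
        print(f"not a zstd file: {name}")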

jieya.py (decompress):

import os
import zstandard as zstd

# Input directory (contains all the .zst files)
input_dir = "./"  # replace with your own directory

# Decompress each file
for i in range(1, 86):  # indices 1 to 85
    file_name = f"中文小说_{i}.jsonl.zst"
    input_path = os.path.join(input_dir, file_name)
    output_file_name = file_name.replace(".zst", "")
    output_path = os.path.join(input_dir, output_file_name)

    # Decompress the .zst file
    with open(input_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        with open(output_path, 'wb') as output_file:
            dctx.copy_stream(compressed_file, output_file)

    print(f"Decompressed: {output_file_name}")

print("All files decompressed.")

convert.py: convert everything to JSONL

import io
import json
import csv
import zstandard as zstd

def convert_json_to_jsonl(json_file, jsonl_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

def convert_csv_to_jsonl(csv_file, jsonl_file):
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        with open(jsonl_file, 'w', encoding='utf-8') as out_file:
            for row in reader:
                out_file.write(json.dumps(row, ensure_ascii=False) + '\n')

def convert_zst_to_jsonl(zst_file, jsonl_file):
    # stream_reader() returns a binary stream that does not support line iteration;
    # wrap it in a TextIOWrapper to read the decompressed data line by line.
    with open(zst_file, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            with open(jsonl_file, 'w', encoding='utf-8') as out_file:
                for line in text_stream:
                    out_file.write(line)

if __name__ == "__main__":
    # Convert json files
    convert_json_to_jsonl('万卷-专利-16k-16715条.json', '万卷-专利-16k-16715条.jsonl')
    convert_json_to_jsonl('万卷-新闻-16k-2490条.json', '万卷-新闻-16k-2490条.jsonl')
    convert_json_to_jsonl('中外名著71本.json', '中外名著71本.jsonl')
    convert_json_to_jsonl('金庸小说15本.json', '金庸小说15本.jsonl')

    # Convert csv files
    convert_csv_to_jsonl('学习强国1.6w字以上459条.csv', '学习强国1.6w字以上459条.jsonl')
    convert_csv_to_jsonl('悟道200G数据-32000字以上-16000条.csv', '悟道200G数据-32000字以上-16000条.jsonl')
    convert_csv_to_jsonl('政府工作报告1.6w字以上170条.csv', '政府工作报告1.6w字以上170条.jsonl')
    convert_csv_to_jsonl('中文维基百科-16000字以上-708条.csv', '中文维基百科-16000字以上-708条.jsonl')

    # Convert zst files
    convert_zst_to_jsonl('CCI中文互联网语料-大于16k字-30000条.jsonl.zst', 'CCI中文互联网语料-大于16k字-30000条.jsonl')
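
To spot-check the conversions, re-parse every line of the produced .jsonl files and count the records; a minimal sketch, assuming it runs in the output directory:

import glob
import json

# Re-parse every produced .jsonl file and report the record count per file
for path in sorted(glob.glob("*.jsonl")):
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            json.loads(line)  # raises if any line is not valid JSON
            count += 1
    print(f"{path}: {count} records")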

key.py: rename the 'content' and '内容' keys to 'text'

import os
import json

# Target directory
dir_path = "/mnt/shared/AI-QIHUAN/kexin/datasets/long_corpus/LongData-Corpus/LongData_zh"

# Walk through all .jsonl files under the directory
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith(".jsonl"):
            file_path = os.path.join(root, file)
            modified_lines = []
            modified = False

            # Read the file line by line
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    data = json.loads(line.strip())

                    # Check and rename the key
                    if '内容' in data:
                        data['text'] = data.pop('内容')
                        modified = True
                    elif 'content' in data:
                        data['text'] = data.pop('content')
                        modified = True

                    # Collect the (possibly modified) record
                    modified_lines.append(json.dumps(data, ensure_ascii=False))

            # Overwrite the file only if something changed
            if modified:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(modified_lines) + '\n')

print("All files processed!")
