[Paper Notes] LLM Dataset: LongData-Corpus

https://huggingface.co/datasets/yuyijiong/LongData-Corpus

1. Data on Hugging Face

On the dev machine, generate an SSH key, then cat the public key, copy it, and add it to your Hugging Face account settings.
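
With the key registered you can clone the dataset repo over SSH with git; alternatively, here is a minimal sketch using the huggingface_hub Python client (the local_dir path is just a placeholder):

from huggingface_hub import snapshot_download

# Download the whole dataset repo; allow_patterns can restrict this to a subset of files
snapshot_download(
    repo_id="yuyijiong/LongData-Corpus",
    repo_type="dataset",
    local_dir="./LongData-Corpus",  # placeholder path, change as needed
)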

2. The Chinese novel data is on a cloud drive

Tsinghua University Cloud

Download:

#!/bin/bash

# Base URL
base_url="https://cloud.tsinghua.edu.cn/d/0670fcb14d294c97b5cf/files/?p=%2F%E4%B8%AD%E6%96%87%E5%B0%8F%E8%AF%B4_"

# Loop to download files from 1 to 85
for i in {1..85}
do
  # Plain integer index (printf "%d" does not zero-pad; the file names use un-padded numbers, matching jieya.py below)
  num=$(printf "%d" $i)

  # Construct the URL
  url="${base_url}${num}.jsonl.zst&dl=1"

  # Download the file
  wget -O "中文小说_${num}.jsonl.zst" "$url"

  # Check if download was successful
  if [ $? -eq 0 ]; then
    echo "Downloaded: 中文小说_${num}.jsonl.zst"
  else
    echo "Failed to download: 中文小说_${num}.jsonl.zst"
  fi
done
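
Cloud share links occasionally hand back an HTML error page, which wget still saves under the .zst name. A quick sanity check is to look at the first four bytes of each file for the zstd frame magic (28 B5 2F FD); a minimal Python sketch, assuming the files sit in the current directory:

import os

ZSTD_MAGIC = b"\x28\xb5\x2f\xfd"  # zstd frame magic number (0xFD2FB528, little-endian)

for i in range(1, 86):
    name = f"中文小说_{i}.jsonl.zst"
    if not os.path.exists(name):
        print(f"missing: {name}")
        continue
    with open(name, "rb") as f:
        head = f.read(4)
    if head != ZSTD_MAGIC:
        # Likely an HTML error page or a truncated download
        print(f"not a zstd file: {name}")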

jieya.py (decompress):

import os
import zstandard as zstd

# Input directory (contains all the .zst files)
input_dir = "./"  # replace with your own directory

# Decompress each file
for i in range(1, 86):  # indices 1 to 85
    file_name = f"中文小说_{i}.jsonl.zst"
    input_path = os.path.join(input_dir, file_name)
    output_file_name = file_name.replace(".zst", "")
    output_path = os.path.join(input_dir, output_file_name)

    # Decompress the .zst file
    with open(input_path, 'rb') as compressed_file:
        dctx = zstd.ZstdDecompressor()
        with open(output_path, 'wb') as output_file:
            dctx.copy_stream(compressed_file, output_file)

    print(f"Decompressed: {output_file_name}")

print("All files decompressed.")

convert.py: convert everything to JSONL

import io
import json
import csv
import zstandard as zstd

def convert_json_to_jsonl(json_file, jsonl_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    with open(jsonl_file, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

def convert_csv_to_jsonl(csv_file, jsonl_file):
    with open(csv_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        with open(jsonl_file, 'w', encoding='utf-8') as out_file:
            for row in reader:
                out_file.write(json.dumps(row, ensure_ascii=False) + '\n')

def convert_zst_to_jsonl(zst_file, jsonl_file):
    # stream_reader() returns a binary stream that does not support line iteration;
    # wrap it in a TextIOWrapper to read the decompressed data line by line.
    with open(zst_file, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            with open(jsonl_file, 'w', encoding='utf-8') as out_file:
                for line in text_stream:
                    out_file.write(line)

if __name__ == "__main__":
    # Convert json files
    convert_json_to_jsonl('万卷-专利-16k-16715条.json', '万卷-专利-16k-16715条.jsonl')
    convert_json_to_jsonl('万卷-新闻-16k-2490条.json', '万卷-新闻-16k-2490条.jsonl')
    convert_json_to_jsonl('中外名著71本.json', '中外名著71本.jsonl')
    convert_json_to_jsonl('金庸小说15本.json', '金庸小说15本.jsonl')

    # Convert csv files
    convert_csv_to_jsonl('学习强国1.6w字以上459条.csv', '学习强国1.6w字以上459条.jsonl')
    convert_csv_to_jsonl('悟道200G数据-32000字以上-16000条.csv', '悟道200G数据-32000字以上-16000条.jsonl')
    convert_csv_to_jsonl('政府工作报告1.6w字以上170条.csv', '政府工作报告1.6w字以上170条.jsonl')
    convert_csv_to_jsonl('中文维基百科-16000字以上-708条.csv', '中文维基百科-16000字以上-708条.jsonl')

    # Convert zst files
    convert_zst_to_jsonl('CCI中文互联网语料-大于16k字-30000条.jsonl.zst', 'CCI中文互联网语料-大于16k字-30000条.jsonl')
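
To spot-check the conversions, re-parse every line of the produced .jsonl files and count the records; a minimal sketch, assuming it runs in the output directory:

import glob
import json

# Re-parse every produced .jsonl file and report the record count per file
for path in sorted(glob.glob("*.jsonl")):
    count = 0
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            json.loads(line)  # raises if any line is not valid JSON
            count += 1
    print(f"{path}: {count} records")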

key.py: rename the 'content' and '内容' keys to 'text'

import os
import json

# Target directory
dir_path = "/mnt/shared/AI-QIHUAN/kexin/datasets/long_corpus/LongData-Corpus/LongData_zh"

# Walk through all .jsonl files under the directory
for root, dirs, files in os.walk(dir_path):
    for file in files:
        if file.endswith(".jsonl"):
            file_path = os.path.join(root, file)
            modified_lines = []
            modified = False

            # Read the file line by line
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    data = json.loads(line.strip())

                    # Check and rename the key
                    if '内容' in data:
                        data['text'] = data.pop('内容')
                        modified = True
                    elif 'content' in data:
                        data['text'] = data.pop('content')
                        modified = True

                    # Collect the (possibly modified) record
                    modified_lines.append(json.dumps(data, ensure_ascii=False))

            # Overwrite the file only if something changed
            if modified:
                with open(file_path, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(modified_lines) + '\n')

print("All files processed!")
