https://huggingface.co/datasets/yuyijiong/LongData-Corpus
1、hf的数据
在开发机上要设置ssh key,然后cat复制之后再设置到hf上
2、中文小说数据在云盘上
下载:
#!/bin/bash
# Download 中文小说_1.jsonl.zst .. 中文小说_85.jsonl.zst from the Tsinghua cloud share.
# Base URL (the ?p= query selects the file inside the shared folder)
base_url="https://cloud.tsinghua.edu.cn/d/0670fcb14d294c97b5cf/files/?p=%2F%E4%B8%AD%E6%96%87%E5%B0%8F%E8%AF%B4_"
# Loop to download files 1 to 85
for i in {1..85}
do
    # NOTE: the numbers are NOT zero-padded (1, 2, ..., 85) — the companion
    # Python scripts (jieya.py, key.py) build the same unpadded file names.
    num=$(printf "%d" "$i")
    # Construct the URL (&dl=1 requests a direct download)
    url="${base_url}${num}.jsonl.zst&dl=1"
    # Test wget's exit status directly instead of inspecting $? afterwards
    if wget -O "中文小说_${num}.jsonl.zst" "$url"; then
        echo "Downloaded: 中文小说_${num}.jsonl.zst"
    else
        echo "Failed to download: 中文小说_${num}.jsonl.zst"
    fi
done
jieya.py 解压:
import os
import zstandard as zstd

# Input directory containing the downloaded .zst files
input_dir = "./"  # change to the directory holding your files

# One decompressor can safely be reused for every file, so hoist the
# loop-invariant construction out of the loop.
dctx = zstd.ZstdDecompressor()

# Decompress 中文小说_1.jsonl.zst .. 中文小说_85.jsonl.zst in place.
# NOTE: raises FileNotFoundError if any numbered file is missing — that is
# intentional, a gap means the download step failed.
for i in range(1, 86):  # indices 1..85
    file_name = f"中文小说_{i}.jsonl.zst"
    input_path = os.path.join(input_dir, file_name)
    output_file_name = file_name.replace(".zst", "")
    output_path = os.path.join(input_dir, output_file_name)
    # Stream the decompression so large files are never fully held in memory.
    with open(input_path, 'rb') as compressed_file:
        with open(output_path, 'wb') as output_file:
            dctx.copy_stream(compressed_file, output_file)
    print(f"解压完成: {output_file_name}")
print("所有文件解压完成。")
convert.py:转换成jsonl
import json
import csv
import zstandard as zstd
def convert_json_to_jsonl(json_file, jsonl_file):
    """Read a JSON array from *json_file* and write it as JSON Lines to *jsonl_file*.

    Each element of the top-level array becomes one line; non-ASCII text is
    written verbatim (ensure_ascii=False).
    """
    with open(json_file, 'r', encoding='utf-8') as src:
        records = json.load(src)
    with open(jsonl_file, 'w', encoding='utf-8') as dst:
        dst.writelines(json.dumps(record, ensure_ascii=False) + '\n'
                       for record in records)
def convert_csv_to_jsonl(csv_file, jsonl_file):
    """Convert a CSV file to JSON Lines, one object per data row.

    The CSV header row supplies the keys; every cell value stays a string,
    exactly as csv.DictReader yields it.
    """
    with open(csv_file, 'r', encoding='utf-8') as src, \
         open(jsonl_file, 'w', encoding='utf-8') as dst:
        for record in csv.DictReader(src):
            dst.write(json.dumps(record, ensure_ascii=False) + '\n')
def convert_zst_to_jsonl(zst_file, jsonl_file):
    """Decompress a zstd-compressed JSONL file into a plain-text JSONL file.

    Bug fix: the object returned by ZstdDecompressor.stream_reader() is a raw
    binary stream that does not support line iteration (its readline raises
    io.UnsupportedOperation), so the original ``for line in reader`` loop
    failed at runtime. Wrapping the stream in io.TextIOWrapper provides both
    line iteration and incremental UTF-8 decoding that never splits a
    multi-byte character across chunk boundaries.
    """
    import io  # local import keeps this fix self-contained within the function
    with open(zst_file, 'rb') as f:
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(f) as reader:
            text_stream = io.TextIOWrapper(reader, encoding='utf-8')
            with open(jsonl_file, 'w', encoding='utf-8') as out_file:
                for line in text_stream:
                    out_file.write(line)
if __name__ == "__main__":
    # Table-driven conversion: (source, destination) pairs per input format.
    json_jobs = [
        ('万卷-专利-16k-16715条.json', '万卷-专利-16k-16715条.jsonl'),
        ('万卷-新闻-16k-2490条.json', '万卷-新闻-16k-2490条.jsonl'),
        ('中外名著71本.json', '中外名著71本.jsonl'),
        ('金庸小说15本.json', '金庸小说15本.jsonl'),
    ]
    csv_jobs = [
        ('学习强国1.6w字以上459条.csv', '学习强国1.6w字以上459条.jsonl'),
        ('悟道200G数据-32000字以上-16000条.csv', '悟道200G数据-32000字以上-16000条.jsonl'),
        ('政府工作报告1.6w字以上170条.csv', '政府工作报告1.6w字以上170条.jsonl'),
        ('中文维基百科-16000字以上-708条.csv', '中文维基百科-16000字以上-708条.jsonl'),
    ]
    zst_jobs = [
        ('CCI中文互联网语料-大于16k字-30000条.jsonl.zst', 'CCI中文互联网语料-大于16k字-30000条.jsonl'),
    ]
    for src, dst in json_jobs:
        convert_json_to_jsonl(src, dst)
    for src, dst in csv_jobs:
        convert_csv_to_jsonl(src, dst)
    for src, dst in zst_jobs:
        convert_zst_to_jsonl(src, dst)
key.py:把key是'content'和'内容'的,变成'text'
import os
import json

# Directory tree holding the .jsonl files whose keys should be normalised
dir_path = "/mnt/shared/AI-QIHUAN/kexin/datasets/long_corpus/LongData-Corpus/LongData_zh"

# Walk the tree and, in every .jsonl file, rename the key '内容' or 'content'
# to 'text'. Files are rewritten in place only when at least one key changed.
for root, _dirs, files in os.walk(dir_path):
    for file in files:
        if not file.endswith(".jsonl"):
            continue  # guard clause instead of nesting the whole body
        file_path = os.path.join(root, file)
        modified_lines = []
        modified = False
        # Read line by line so large files are never parsed as one blob.
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Robustness fix: a blank line would previously crash
                    # json.loads; skip it instead.
                    continue
                data = json.loads(line)
                # '内容' takes precedence when both keys are present.
                if '内容' in data:
                    data['text'] = data.pop('内容')
                    modified = True
                elif 'content' in data:
                    data['text'] = data.pop('content')
                    modified = True
                modified_lines.append(json.dumps(data, ensure_ascii=False))
        # Rewrite the file only if some key was actually renamed.
        if modified:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write('\n'.join(modified_lines) + '\n')
print("所有文件处理完成!")