# Dataset quickstart: https://www.modelscope.cn/datasets/AI-ModelScope/COIG-CQIA/quickstart
from modelscope.msdatasets import MsDataset
# Load the 'train' split of the 'wiki' subset of AI-ModelScope/COIG-CQIA.
ds = MsDataset.load(
    'AI-ModelScope/COIG-CQIA',
    split='train',
    subset_name='wiki',
)
# 采样 (Sampling)
import os
import random
# Fraction of every file's lines that is kept (0.05 == 5%).
sample_rate = 0.05
# Root folder; each immediate sub-folder is one domain holding .jsonl files.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'
# Path of the new JSONL file that receives the combined sample.
output_file_path = '/data3/zhouqiang/LLM_scratch/dataset/combined_sampled_data.jsonl'


def collect_fixed_ratio_samples(root, rate, rng=random):
    """Sample a fixed fraction of lines from every ``.jsonl`` file under *root*.

    Only files located directly inside an immediate sub-folder of *root* are
    read. Folders and files are visited in sorted order so that a seeded
    *rng* gives a reproducible result.

    Args:
        root: Folder whose sub-folders contain the ``.jsonl`` files.
        rate: Fraction of each file's lines to keep. At least one line is
            taken from every non-empty file, and never more than the file has.
        rng: Object providing ``sample(population, k)``; defaults to the
            ``random`` module. Pass ``random.Random(seed)`` for repeatability.

    Returns:
        A ``(total_line_count, sampled_lines)`` tuple. Every sampled line
        ends with a newline.
    """
    total_count = 0
    sampled_lines = []
    for domain in sorted(os.listdir(root)):
        domain_path = os.path.join(root, domain)
        if not os.path.isdir(domain_path):
            continue
        for file_name in sorted(os.listdir(domain_path)):
            if not file_name.endswith('.jsonl'):
                continue
            file_path = os.path.join(domain_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            if not lines:
                # sample() raises ValueError when asked for one item from an
                # empty population, so empty files are skipped.
                continue
            if not lines[-1].endswith('\n'):
                # Without this, a sampled final line would be fused with the
                # next record when the lines are written back to back.
                lines[-1] += '\n'
            total_count += len(lines)
            sample_size = min(len(lines), max(1, int(len(lines) * rate)))
            # Keep only the sampled lines; the rest of the file is released.
            sampled_lines.extend(rng.sample(lines, sample_size))
    return total_count, sampled_lines


def write_sampled_lines(output_path, lines):
    """Write *lines* to *output_path* (UTF-8) and return how many were written.

    The parent folder is created when it does not exist yet. An existing
    file at *output_path* is overwritten.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs('') raises, so skip it for bare file names
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(lines)
    return len(lines)


if __name__ == "__main__":
    total_data_count, sampled_lines = collect_fixed_ratio_samples(
        root_folder_path, sample_rate
    )
    # Report the size of the whole data set.
    print("Total lines before sampling:", total_data_count)
    sampled_total_count = write_sampled_lines(output_file_path, sampled_lines)
    # Report the size of the sample.
    print("Total lines after sampling:", sampled_total_count)
    print(f"Sampling completed and combined data is stored in: {output_file_path}")
import os
import random
# Adjustment factor: each domain is sampled at rate k / (lines in domain),
# capped at 100%. Tune it to the size and distribution of the data.
k = 500
# Root folder; each immediate sub-folder is one domain holding .jsonl files.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'
# Path of the new JSONL file that receives the combined sample.
output_file_path = '/data3/zhouqiang/LLM_scratch/dataset/inverse_sampled_data.jsonl'


def _iter_domain_file_lines(domain_path):
    """Yield the line list of each ``.jsonl`` file directly in *domain_path*.

    Files are visited in sorted order. A final line lacking a newline gets
    one appended so records cannot fuse when written back to back.
    """
    for file_name in sorted(os.listdir(domain_path)):
        if not file_name.endswith('.jsonl'):
            continue
        file_path = os.path.join(domain_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            lines = file.readlines()
        if lines and not lines[-1].endswith('\n'):
            lines[-1] += '\n'
        yield lines


def count_domain_lines(root):
    """Return ``{domain: line_count}`` for every sub-folder of *root*.

    The count is the total number of lines across the ``.jsonl`` files
    located directly inside the domain folder.
    """
    counts = {}
    for domain in sorted(os.listdir(root)):
        domain_path = os.path.join(root, domain)
        if os.path.isdir(domain_path):
            counts[domain] = sum(
                len(lines) for lines in _iter_domain_file_lines(domain_path)
            )
    return counts


def inverse_sample_domains(root, output_path, target, rng=random):
    """Sample each domain at a rate inversely proportional to its size.

    The per-domain rate is ``min(target / domain_line_count, 1)`` and is
    applied to each file of the domain separately; at least one line is
    taken from every non-empty file. Sampled lines are appended to
    *output_path* file by file.

    Args:
        root: Folder whose sub-folders are the domains.
        output_path: Destination JSONL file (overwritten; parent folder is
            created when missing).
        target: Numerator of the sampling rate (the script's ``k``).
        rng: Object providing ``sample(population, k)``; defaults to the
            ``random`` module.

    Returns:
        The number of lines written to *output_path*.
    """
    domain_counts = count_domain_lines(root)
    output_dir = os.path.dirname(output_path)
    if output_dir:  # os.makedirs('') raises, so skip it for bare file names
        os.makedirs(output_dir, exist_ok=True)

    written = 0
    with open(output_path, 'w', encoding='utf-8') as output_file:
        for domain, count in domain_counts.items():
            if count <= 0:  # also avoids dividing by zero below
                print(f"No data to sample in domain: {domain}")
                continue
            rate = min(target / count, 1)  # never sample more than 100%
            domain_path = os.path.join(root, domain)
            for lines in _iter_domain_file_lines(domain_path):
                if not lines:
                    # A non-empty domain may still hold an empty file, and
                    # sample() raises ValueError on an empty population.
                    continue
                sample_size = min(len(lines), max(1, int(len(lines) * rate)))
                picked = rng.sample(lines, sample_size)
                output_file.writelines(picked)
                written += len(picked)
    return written


if __name__ == "__main__":
    inverse_sample_domains(root_folder_path, output_file_path, k)
    print(f"Inverse sampling completed and combined data is stored in: {output_file_path}")
# 画图 (Plotting)
import os
import matplotlib.pyplot as plt
import pandas as pd
# Root folder whose immediate sub-folders are the per-domain directories.
root_folder_path = '/data3/zhouqiang/LLM_scratch/dataset/COIG-CQIA'

# Every directory entry under the root is treated as one domain.
domains = []
for entry in os.listdir(root_folder_path):
    if os.path.isdir(os.path.join(root_folder_path, entry)):
        domains.append(entry)

# For each domain, count the .jsonl files located directly inside its folder.
domain_counts = [
    sum(
        1
        for name in os.listdir(os.path.join(root_folder_path, domain_name))
        if name.endswith('.jsonl')
    )
    for domain_name in domains
]

# Tabulate domain names next to their file counts.
df = pd.DataFrame({'Domain': domains, 'Counts': domain_counts})

# Folder that receives the rendered images; created on first use.
output_folder = '/data3/zhouqiang/LLM_scratch/dataset/output_images'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Render one single-bar chart per domain and save it as <domain>.jpg.
for record in df.itertuples(index=False):
    figure, axes = plt.subplots()  # a fresh figure for every domain
    axes.bar(record.Domain, record.Counts, color='skyblue')
    axes.set_title(f'Number of JSON Files in {record.Domain}')
    axes.set_xlabel('Domain')
    axes.set_ylabel('Number of JSON Files')
    plt.xticks(rotation=45)
    figure.tight_layout()  # keep the labels from overlapping
    image_path = os.path.join(output_folder, f'{record.Domain}.jpg')
    figure.savefig(image_path)
    plt.close(figure)  # release the figure so memory does not accumulate