shutil.copy(old_path, train_path)

在使用Python的shutil.copy进行大量图片拷贝时,由于遇到相同文件名的情况,导致实际拷贝数量少于预期。解决办法是添加文件重命名策略,确保每个图片都有唯一文件名。

shutil.copy 在拷贝图片时,如果目标文件夹中已存在同名文件,会直接覆盖该文件而不会报错或自动改名,因此多个同名文件最终只会保留最后拷贝的那一个。

今天原本要把不同文件夹下的 40000 张图片拷贝到同一个文件夹中,但由于不同文件夹下存在同名文件,后拷贝的文件覆盖了先拷贝的同名文件,所以目标文件夹中的图片总数始终达不到 40000。

# Author: CSDN user 笑脸惹桃花  https://blog.csdn.net/qq_67105081?type=blog
# GitHub: peng-xiaobai  https://github.com/peng-xiaobai/Dataset-Conversion
import math
import os
import random
import shutil

# random.seed(0)  # optional: uncomment to make the shuffle reproducible


def _index_by_stem(folder):
    """Map each regular file's extension-less name to its full file name.

    Entries are visited in sorted order so the result does not depend on the
    arbitrary order returned by ``os.listdir``. Sub-directories are skipped
    because ``shutil.copy`` cannot copy them. If two files share a stem, the
    one that sorts last wins.
    """
    return {
        os.path.splitext(name)[0]: name
        for name in sorted(os.listdir(folder))
        if os.path.isfile(os.path.join(folder, name))
    }


def _copy_pairs(pairs, file_path, label_path, new_file_path, split_name):
    """Copy every (stem, image, label) triple into ``<new_file_path>/<split_name>``."""
    img_dir = os.path.join(new_file_path, split_name, 'images')
    lbl_dir = os.path.join(new_file_path, split_name, 'labels')
    # Create the target folders once per split instead of once per file.
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)
    for _stem, img_file, label_file in pairs:
        shutil.copy(os.path.join(file_path, img_file), os.path.join(img_dir, img_file))
        shutil.copy(os.path.join(label_path, label_file), os.path.join(lbl_dir, label_file))


def split_data(file_path, label_path, new_file_path, train_rate, val_rate, test_rate):
    """Randomly split matched image/label pairs into train, val and test folders.

    Images and labels are paired by file name without extension. Files without
    a partner are reported on stdout and not copied. The result is written to
    ``<new_file_path>/{train,val,test}/{images,labels}``.

    Args:
        file_path: Folder containing the image files.
        label_path: Folder containing the label files.
        new_file_path: Root folder that receives the split dataset.
        train_rate: Fraction of the matched pairs used for training.
        val_rate: Fraction of the matched pairs used for validation.
        test_rate: Fraction of the matched pairs used for testing; the test
            split receives every pair left over after train and val.

    Raises:
        ValueError: If a rate is negative or the three rates do not sum to 1.
    """
    rates = (train_rate, val_rate, test_rate)
    if any(rate < 0 for rate in rates):
        raise ValueError("train_rate、val_rate、test_rate 必须为非负数")
    if not math.isclose(sum(rates), 1.0, abs_tol=1e-6):
        raise ValueError(
            f"train_rate + val_rate + test_rate 必须等于 1,当前为 {sum(rates)}"
        )

    images_no_ext = _index_by_stem(file_path)
    labels_no_ext = _index_by_stem(label_path)

    matched_data = [
        (stem, images_no_ext[stem], labels_no_ext[stem])
        for stem in images_no_ext
        if stem in labels_no_ext
    ]
    unmatched_images = [stem for stem in images_no_ext if stem not in labels_no_ext]
    unmatched_labels = [stem for stem in labels_no_ext if stem not in images_no_ext]

    if unmatched_images:
        print("未匹配的图片文件:")
        for stem in unmatched_images:
            print(images_no_ext[stem])
    if unmatched_labels:
        print("未匹配的标签文件:")
        for stem in unmatched_labels:
            print(labels_no_ext[stem])

    random.shuffle(matched_data)
    total = len(matched_data)
    train_end = int(train_rate * total)
    val_end = int((train_rate + val_rate) * total)

    splits = (
        ('train', matched_data[:train_end]),
        ('val', matched_data[train_end:val_end]),
        ('test', matched_data[val_end:]),
    )
    for split_name, pairs in splits:
        _copy_pairs(pairs, file_path, label_path, new_file_path, split_name)

    print("数据集已划分完成")


if __name__ == '__main__':
    file_path = r"G:\data\JPEGImages"  # image folder
    label_path = r'G:\data\labels'  # label folder
    new_file_path = r'G:\VOCdevkit'  # destination for the split dataset
    split_data(file_path, label_path, new_file_path, train_rate=0.8, val_rate=0.1, test_rate=0.1)
08-06
# Author: CSDN user 笑脸惹桃花  https://blog.csdn.net/qq_67105081?type=blog
# GitHub: peng-xiaobai  https://github.com/peng-xiaobai/Dataset-Conversion
import math
import os
import random
import shutil

# random.seed(0)  # optional: uncomment to make the shuffle reproducible


def _index_by_stem(folder):
    """Map each regular file's extension-less name to its full file name.

    Entries are visited in sorted order so the result does not depend on the
    arbitrary order returned by ``os.listdir``. Sub-directories are skipped
    because ``shutil.copy`` cannot copy them. If two files share a stem, the
    one that sorts last wins.
    """
    return {
        os.path.splitext(name)[0]: name
        for name in sorted(os.listdir(folder))
        if os.path.isfile(os.path.join(folder, name))
    }


def _copy_pairs(pairs, file_path, label_path, new_file_path, split_name):
    """Copy every (stem, image, label) triple into ``<new_file_path>/<split_name>``."""
    img_dir = os.path.join(new_file_path, split_name, 'images')
    lbl_dir = os.path.join(new_file_path, split_name, 'labels')
    # Create the target folders once per split instead of once per file.
    os.makedirs(img_dir, exist_ok=True)
    os.makedirs(lbl_dir, exist_ok=True)
    for _stem, img_file, label_file in pairs:
        shutil.copy(os.path.join(file_path, img_file), os.path.join(img_dir, img_file))
        shutil.copy(os.path.join(label_path, label_file), os.path.join(lbl_dir, label_file))


def split_data(file_path, label_path, new_file_path, train_rate, val_rate, test_rate):
    """Randomly split matched image/label pairs into train, val and test folders.

    Images and labels are paired by file name without extension. Files without
    a partner are reported on stdout and not copied. The result is written to
    ``<new_file_path>/{train,val,test}/{images,labels}``.

    Args:
        file_path: Folder containing the image files.
        label_path: Folder containing the label files.
        new_file_path: Root folder that receives the split dataset.
        train_rate: Fraction of the matched pairs used for training.
        val_rate: Fraction of the matched pairs used for validation.
        test_rate: Fraction of the matched pairs used for testing; the test
            split receives every pair left over after train and val.

    Raises:
        ValueError: If a rate is negative or the three rates do not sum to 1.
    """
    rates = (train_rate, val_rate, test_rate)
    if any(rate < 0 for rate in rates):
        raise ValueError("train_rate、val_rate、test_rate 必须为非负数")
    if not math.isclose(sum(rates), 1.0, abs_tol=1e-6):
        raise ValueError(
            f"train_rate + val_rate + test_rate 必须等于 1,当前为 {sum(rates)}"
        )

    images_no_ext = _index_by_stem(file_path)
    labels_no_ext = _index_by_stem(label_path)

    matched_data = [
        (stem, images_no_ext[stem], labels_no_ext[stem])
        for stem in images_no_ext
        if stem in labels_no_ext
    ]
    unmatched_images = [stem for stem in images_no_ext if stem not in labels_no_ext]
    unmatched_labels = [stem for stem in labels_no_ext if stem not in images_no_ext]

    if unmatched_images:
        print("未匹配的图片文件:")
        for stem in unmatched_images:
            print(images_no_ext[stem])
    if unmatched_labels:
        print("未匹配的标签文件:")
        for stem in unmatched_labels:
            print(labels_no_ext[stem])

    random.shuffle(matched_data)
    total = len(matched_data)
    train_end = int(train_rate * total)
    val_end = int((train_rate + val_rate) * total)

    splits = (
        ('train', matched_data[:train_end]),
        ('val', matched_data[train_end:val_end]),
        ('test', matched_data[val_end:]),
    )
    for split_name, pairs in splits:
        _copy_pairs(pairs, file_path, label_path, new_file_path, split_name)

    print("数据集已划分完成")


if __name__ == '__main__':
    file_path = r"f:\data\JPEGImages"  # image folder
    label_path = r'f:\data\labels'  # label folder
    new_file_path = r"f:\VOCdevkit"  # destination for the split dataset
    split_data(file_path, label_path, new_file_path, train_rate=0.8, val_rate=0.1, test_rate=0.1)
10-15
import os
import shutil

import pandas as pd

try:
    # Not used by this script; kept so the name stays available, but guarded
    # so the script still runs when Pillow is not installed.
    from PIL import Image
except ImportError:
    Image = None


def _copy_and_rename(labels, folder_by_bearing, output_folder, prefix):
    """Copy the images listed in ``labels`` under sequential names.

    Each row's image is looked up in the folder registered for its ``bearing``
    value and copied to ``output_folder`` as ``<prefix>_<NNNNN>.png``. Rows
    whose image is missing are reported and dropped, so every remaining row
    points at exactly one copied file and no new name is used twice.

    Returns:
        A copy of ``labels`` restricted to the rows that were copied, with the
        ``filename`` column replaced by the new names.
    """
    kept_positions = []
    new_names = []
    pairs = zip(labels['filename'], labels['bearing'])
    for position, (old_filename, bearing) in enumerate(pairs):
        source_path = os.path.join(folder_by_bearing[bearing], old_filename)
        if not os.path.exists(source_path):
            print(f"警告: 找不到文件 {source_path}")
            continue
        # The sequence number is the count of files copied so far.
        new_filename = f"{prefix}_{len(new_names):05d}.png"
        shutil.copy2(source_path, os.path.join(output_folder, new_filename))
        kept_positions.append(position)
        new_names.append(new_filename)

    renamed = labels.iloc[kept_positions].copy()
    renamed['filename'] = new_names
    return renamed.reset_index(drop=True)


def merge_datasets_with_simple_names(bearing1_1_folder, bearing1_2_folder, bearing1_3_folder,
                                     train_output_folder, test_output_folder):
    """Build a train set from Bearing1_1 + Bearing1_2 and a test set from Bearing1_3.

    Each bearing folder must contain ``sliding_window_rul_labels_corrected.csv``
    with at least the columns ``filename`` and ``rul_normalized``. Images are
    copied under simplified names (``t_NNNNN.png`` for train, ``te_NNNNN.png``
    for test) so equal file names in different bearing folders cannot collide.
    Label rows whose image file is missing are dropped with a warning.

    Args:
        bearing1_1_folder: Path of the Bearing1_1 folder.
        bearing1_2_folder: Path of the Bearing1_2 folder.
        bearing1_3_folder: Path of the Bearing1_3 folder.
        train_output_folder: Output folder for the training set.
        test_output_folder: Output folder for the test set.

    Returns:
        ``(simplified_train_labels, simplified_test_labels)``: data frames with
        the columns ``filename``, ``rul_normalized`` and ``bearing``.
    """
    # Create the output folders.
    if not os.path.exists(train_output_folder):
        os.makedirs(train_output_folder)
        print(f"创建训练集文件夹: {train_output_folder}")
    if not os.path.exists(test_output_folder):
        os.makedirs(test_output_folder)
        print(f"创建测试集文件夹: {test_output_folder}")

    # Read each bearing's label file.
    labels_b1_1 = pd.read_csv(os.path.join(bearing1_1_folder, "sliding_window_rul_labels_corrected.csv"))
    labels_b1_2 = pd.read_csv(os.path.join(bearing1_2_folder, "sliding_window_rul_labels_corrected.csv"))
    labels_b1_3 = pd.read_csv(os.path.join(bearing1_3_folder, "sliding_window_rul_labels_corrected.csv"))

    # Tag every row with the bearing it came from.
    labels_b1_1['bearing'] = 'B1_1'
    labels_b1_2['bearing'] = 'B1_2'
    labels_b1_3['bearing'] = 'B1_3'

    # Training set: merge the labels, then copy and rename the images.
    print("处理训练集图像文件名...")
    train_labels = _copy_and_rename(
        pd.concat([labels_b1_1, labels_b1_2], ignore_index=True),
        {'B1_1': bearing1_1_folder, 'B1_2': bearing1_2_folder},
        train_output_folder,
        "t",
    )
    train_labels_path = os.path.join(train_output_folder, "train_labels.csv")
    train_labels.to_csv(train_labels_path, index=False)
    print(f"训练集标签已保存到: {train_labels_path}")

    # Test set.
    print("处理测试集图像...")
    test_labels = _copy_and_rename(
        labels_b1_3,
        {'B1_3': bearing1_3_folder},
        test_output_folder,
        "te",
    )
    test_labels_path = os.path.join(test_output_folder, "test_labels.csv")
    test_labels.to_csv(test_labels_path, index=False)
    print(f"测试集标签已保存到: {test_labels_path}")

    # Simplified label files keep only the necessary columns.
    simplified_train_labels = train_labels[['filename', 'rul_normalized', 'bearing']]
    simplified_test_labels = test_labels[['filename', 'rul_normalized', 'bearing']]

    simplified_train_path = os.path.join(train_output_folder, "simple_train_labels.csv")
    simplified_test_path = os.path.join(test_output_folder, "simple_test_labels.csv")
    simplified_train_labels.to_csv(simplified_train_path, index=False)
    simplified_test_labels.to_csv(simplified_test_path, index=False)
    print(f"简化版训练集标签已保存到: {simplified_train_path}")
    print(f"简化版测试集标签已保存到: {simplified_test_path}")

    # Summary statistics.
    print("\n数据集划分完成:")
    print(f"训练集样本数: {len(train_labels)}")
    print(f"测试集样本数: {len(test_labels)}")
    print(f"训练集轴承: B1_1 ({len(labels_b1_1)} 样本), B1_2 ({len(labels_b1_2)} 样本)")
    print(f"测试集轴承: B1_3 ({len(labels_b1_3)} 样本)")

    return simplified_train_labels, simplified_test_labels


# Main program
if __name__ == "__main__":
    # Paths
    bearing1_1_folder = r"D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\WT_SlidingWindow_Corrected\Bearing1_1"
    bearing1_2_folder = r"D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\WT_SlidingWindow_Corrected\Bearing1_2"
    bearing1_3_folder = r"D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\WT_SlidingWindow_Corrected\Bearing1_3"
    train_output_folder = r"D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\WT_SlidingWindow_Corrected\train"
    test_output_folder = r"D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\WT_SlidingWindow_Corrected\test"

    # Merge the datasets.
    train_labels, test_labels = merge_datasets_with_simple_names(
        bearing1_1_folder,
        bearing1_2_folder,
        bearing1_3_folder,
        train_output_folder,
        test_output_folder
    )

    # Preview the data.
    print("\n简化版训练集标签预览:")
    print(train_labels.head())
    print("\n简化版测试集标签预览:")
    print(test_labels.head())

    # Show sample simplified file names.
    print("\n文件名示例:")
    print(f"训练集: {train_labels['filename'].iloc[0]}")
    print(f"测试集: {test_labels['filename'].iloc[0]}")

# Reader request that follows the script on the original page:
# Following the dataset-building method above, build the training set from
# "D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\CWT\Bearing1_1hdck"
# and
# "D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\CWT\Bearing1_2hdck",
# saving the result to
# "D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\CWT\shujuji".
# Build the test set from
# "D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\CWT\Bearing1_3hdck",
# saving the result to
# "D:\成电——研究生\基于数据驱动的故障诊断研究\数据集汇总\phm-ieee-2012-data-challenge-dataset-master\CWT\ceshiji".
# The code above is for reference only and may be modified as appropriate;
# please provide the complete code.
最新发布
10-15
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值