Image Deduplication in Python

Install the required library

pip install Pillow

Deduplication by image MD5 (exact matches only; visually similar images will not be removed)

Delete duplicate images from the source folder:

import os
import hashlib

def image_hash(image_path):
    """Compute the MD5 hash of an image file."""
    with open(image_path, "rb") as image_file:
        hasher = hashlib.md5()  # MD5 as the hash algorithm
        chunk = image_file.read(8192)  # read the file in chunks to keep memory usage low
        while chunk:
            hasher.update(chunk)
            chunk = image_file.read(8192)
        return hasher.hexdigest()

def delete_duplicate_images(directory):
    """Delete duplicate images in a directory."""
    image_hashes = {}  # maps each hash to the list of files that produced it
    deleted_count = 0  # number of images deleted

    for filename in os.listdir(directory):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):  # supported image formats
            file_path = os.path.join(directory, filename)
            file_hash = image_hash(file_path)
            if file_hash not in image_hashes:
                image_hashes[file_hash] = []
            image_hashes[file_hash].append(file_path)

    # Delete all duplicates, keeping only the first file of each hash group
    for file_paths in image_hashes.values():
        if len(file_paths) > 1:  # more than one file shares this hash
            for path in file_paths[1:]:  # keep the first file, delete the rest
                os.remove(path)
                deleted_count += 1
                print(f"Deleted duplicate image: {path}")

    print(f"Total {deleted_count} duplicate images have been deleted.")

# Example usage (a raw string avoids backslash-escape problems in Windows paths)
directory_path = r'E:\pythoncode\demo\images'  # replace with your folder path
delete_duplicate_images(directory_path)
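
Because MD5 hashes the raw file bytes, only byte-identical files are caught: the same picture re-saved at a different JPEG quality gets a different hash. A minimal sketch demonstrating the limitation (the file names and temp directory here are made up for illustration):

import hashlib
import os
import tempfile
from PIL import Image

tmp_dir = tempfile.mkdtemp()
img = Image.new('RGB', (64, 64), 'red')               # one and the same picture...
img.save(os.path.join(tmp_dir, 'a.jpg'), quality=95)  # ...saved at two JPEG qualities
img.save(os.path.join(tmp_dir, 'b.jpg'), quality=75)

def md5_of(path):
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()

# Different bytes, different hash, even though the images look identical:
print(md5_of(os.path.join(tmp_dir, 'a.jpg')) == md5_of(os.path.join(tmp_dir, 'b.jpg')))  # False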

Move unique images to a new folder

import os
import hashlib
import shutil

def calculate_hash(image_path, hash_algorithm="md5"):
    """Compute the hash of an image file."""
    with open(image_path, "rb") as f:
        image_data = f.read()
        if hash_algorithm == "md5":
            return hashlib.md5(image_data).hexdigest()
        elif hash_algorithm == "sha1":
            return hashlib.sha1(image_data).hexdigest()
        else:
            raise ValueError("Unsupported hash algorithm")

def remove_duplicates(image_folder, output_folder):
    """Move unique images to output_folder; duplicates stay behind in image_folder."""
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    seen_hashes = set()
    for filename in os.listdir(image_folder):
        image_path = os.path.join(image_folder, filename)
        if not os.path.isfile(image_path):
            continue

        image_hash = calculate_hash(image_path)
        if image_hash not in seen_hashes:
            seen_hashes.add(image_hash)
            output_path = os.path.join(output_folder, filename)
            shutil.move(image_path, output_path)  # unlike os.rename, shutil.move also works across drives
        else:
            print(f"Skipping duplicate (left in source folder): {filename}")

# Example usage
image_folder = r"E:\pythoncode\demo\images"
output_folder = r"E:\pythoncode\demo\images2"
remove_duplicates(image_folder, output_folder)

Large-scale image deduplication based on features (can remove visually similar images, depending on configuration)

Feature-based image deduplication has two main steps:

  • Feature extraction: extract a feature vector for each image with a pretrained model (such as ResNet50).
  • Similarity computation and deduplication: compute the similarity between image features and, given a chosen threshold, keep only one image per group of near-duplicates.

The core method is deduplicate_by_similarity, which detects similarity between feature vectors via cosine similarity and progressively excludes the duplicates.
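
For L2-normalized vectors, cosine similarity is just a dot product, which is what deduplicate_by_similarity exploits. A minimal NumPy illustration with made-up toy vectors:

import numpy as np

features = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
features /= np.linalg.norm(features, axis=1, keepdims=True)  # L2-normalize each row

similarities = features @ features.T  # pairwise cosine similarities
print(similarities.round(3))
# Rows 0 and 1 are nearly parallel (similarity ≈ 0.994 > 0.95), so one of the
# two would be excluded as a duplicate; row 2 is orthogonal to both and kept.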

A pretrained ResNet50 (a common deep-learning backbone, with its classification layer removed) maps each input image to a fixed-length feature vector.
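
Besides Pillow, the example below additionally assumes PyTorch, torchvision, NumPy, and tqdm are available (the exact torch install command may differ depending on your CUDA setup):

pip install torch torchvision numpy tqdm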

import os
from concurrent.futures import ThreadPoolExecutor
from torchvision import models, transforms
from PIL import Image
import numpy as np
import torch
from tqdm import tqdm


class ImageDeduplicator:
    def __init__(self, model_name='resnet50', num_threads=8, gpu_id=1):
        """
        Initialize the deduplicator.
        :param model_name: name of the pretrained model (currently only 'resnet50' is supported)
        :param num_threads: number of worker threads
        :param gpu_id: GPU ID to use, default 1; set to -1 to use the CPU
        """
        self.device = self._set_device(gpu_id)
        self.model = self._load_model(model_name).to(self.device).eval()
        self.num_threads = num_threads
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ImageNet statistics
        ])

    def _set_device(self, gpu_id):
        """Select the compute device (GPU or CPU)."""
        if gpu_id == -1 or not torch.cuda.is_available():
            print("Using CPU for computation.")
            return torch.device("cpu")
        # Address the GPU by index directly; setting CUDA_VISIBLE_DEVICES at this
        # point would be unreliable because torch.cuda.is_available() has already
        # initialized CUDA.
        print(f"Using GPU: {gpu_id}")
        return torch.device(f"cuda:{gpu_id}")

    def _load_model(self, model_name):
        """Load the pretrained model and strip the classification layer."""
        if model_name != 'resnet50':
            raise ValueError(f"Unsupported model: {model_name}")
        from torchvision.models import ResNet50_Weights
        weights = ResNet50_Weights.DEFAULT
        model = models.resnet50(weights=weights)
        return torch.nn.Sequential(*list(model.children())[:-1])  # drop the final fc layer

    def _extract_feature(self, image_path):
        """Extract the feature vector of a single image."""
        try:
            image = Image.open(image_path).convert('RGB')  # load and convert to RGB
            image_tensor = self.transform(image).unsqueeze(0).to(self.device)  # preprocess
            with torch.no_grad():  # no gradients needed for inference
                # squeeze the (1, 2048, 1, 1) output down to a 2048-dim NumPy vector
                return self.model(image_tensor).squeeze().cpu().numpy()
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            return None

    def extract_features(self, image_paths):
        """Extract image features with a thread pool.

        Returns a feature matrix of shape (N, D) — for ResNet50, D = 2048 —
        together with the indices of the paths that were processed successfully,
        so that failed images do not shift the index mapping back to the paths.
        """
        with ThreadPoolExecutor(max_workers=self.num_threads) as executor:
            results = list(tqdm(executor.map(self._extract_feature, image_paths), total=len(image_paths)))
        valid_indices = [i for i, res in enumerate(results) if res is not None]
        features = np.array([results[i] for i in valid_indices])
        return features, valid_indices

    def deduplicate_by_similarity(self, features, threshold=0.95, batch_size=1000):
        """
        Deduplicate based on similarity.
        :param features: feature matrix (N x D)
        :param threshold: similarity threshold
        :param batch_size: batch size
        :return: list of indices of the images to keep
        """
        num_images = len(features)
        to_keep = []
        excluded = np.zeros(num_images, dtype=bool)  # marks images already excluded as duplicates
        features /= np.linalg.norm(features, axis=1, keepdims=True)  # L2-normalize so dot products are cosine similarities

        for i in tqdm(range(num_images), desc="Processing images", unit="image"):
            if excluded[i]:  # skip images already marked as duplicates
                continue
            to_keep.append(i)  # keep the current image

            # indices of the images after i that have not been excluded yet
            remaining_indices = np.where(~excluded[i + 1:])[0] + i + 1
            if not len(remaining_indices):  # nothing left to compare against
                continue

            # Compute cosine similarities in batches to limit memory use; one matrix
            # product gives the similarity of image i to a whole batch at once.
            for j in range(0, len(remaining_indices), batch_size):
                batch_end = min(j + batch_size, len(remaining_indices))
                batch_features = features[remaining_indices[j:batch_end]]
                batch_similarities = np.dot(features[i:i+1], batch_features.T).squeeze()
                # mark every image whose similarity exceeds the threshold (e.g. 0.95) as a duplicate
                excluded[remaining_indices[j:batch_end]] |= batch_similarities > threshold

        # return the indices of the images that were never marked as duplicates
        return to_keep

    def deduplicate(self, image_paths, threshold=0.95, batch_size=1000):
        """
        Deduplicate a list of image paths.
        :param image_paths: list of image paths
        :param threshold: similarity threshold
        :param batch_size: batch size
        :return: deduplicated list of image paths
        """
        print("Extracting features...")
        features, valid_indices = self.extract_features(image_paths)
        print("Performing deduplication...")
        indices_to_keep = self.deduplicate_by_similarity(features, threshold, batch_size)
        # map the kept feature indices back to the original paths via valid_indices
        return [image_paths[valid_indices[i]] for i in indices_to_keep]
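
As a quick sanity check on the feature dimension (a sketch that assumes torchvision is installed): the truncated network ends at the global average pooling layer, so a 224×224 input yields a (1, 2048, 1, 1) tensor, which squeeze() flattens into the 2048-dim vector used above.

import torch
from torchvision import models

# weights=None skips downloading pretrained weights; the architecture alone determines the shape
model = torch.nn.Sequential(*list(models.resnet50(weights=None).children())[:-1]).eval()
with torch.no_grad():
    out = model(torch.zeros(1, 3, 224, 224))
print(out.shape)            # torch.Size([1, 2048, 1, 1])
print(out.squeeze().shape)  # torch.Size([2048])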

Usage:

import os
import shutil
from dddd import ImageDeduplicator  # the module file where the ImageDeduplicator class above was saved

# Assume your images live in a folder
image_folder = r"E:\code_python\pythoncode\demo\images"
output_folder = r"E:\code_python\pythoncode\demo\images2"
image_paths = [os.path.join(image_folder, filename) for filename in os.listdir(image_folder)
               if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]

# Initialize the deduplicator
deduplicator = ImageDeduplicator(model_name='resnet50', num_threads=8, gpu_id=1)

# Run deduplication
unique_image_paths = deduplicator.deduplicate(image_paths, threshold=0.85, batch_size=1000)

# Print the deduplicated image paths
print(f"Total unique images: {len(unique_image_paths)}")
for path in unique_image_paths:
    print(path)

# Copy the deduplicated images to the output folder
os.makedirs(output_folder, exist_ok=True)
for path in unique_image_paths:
    shutil.copy(path, output_folder)
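
Note that this variant copies the kept images and deletes nothing from the source folder, so a run is non-destructive: if the chosen threshold removes too many or too few images, adjust it and simply rerun.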
