自动删除相同的图片，自动处理大量图片

不会抓鼠鼠的阿猫

已于 2024-02-25 00:12:01 修改

阅读量724

点赞数 8

分类专栏： # 图像算法；文章标签： python、图像处理

于 2024-02-25 00:09:11 首次发布

本文链接：https://blog.youkuaiyun.com/zxf2ld/article/details/136277683

版权

图像算法专栏收录该内容

1 篇文章

订阅专栏

文章介绍了一种利用Python脚本的方法，通过均值哈希、差值哈希、pHash以及直方图比较来检测和删除文件夹中重复的图片，提高文件管理效率。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

经常会遇到文件夹里有很多重复图片的情况。

这种情况下手动删除非常浪费时间，可以使用下面的代码自动删除重复的图片，使每张不同的图片恰好保留一份（不重不漏）。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import shutil

import cv2
import numpy as np
import os
import pandas as pd
import traceback

# 均值哈希算法
def aHash(img, shape=(10, 10)):
    """Compute the average hash (aHash) of a BGR image as a '0'/'1' string.

    The image is shrunk to ``shape``, converted to grayscale, and every pixel
    contributes one character in row-major order: '1' when the pixel is
    brighter than the mean gray level of the thumbnail, '0' otherwise.

    Args:
        img: Colour image array accepted by ``cv2.resize`` and
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        shape: ``(rows, cols)`` of the thumbnail. The returned hash has
            ``rows * cols`` characters.

    Returns:
        The hash as a string of '0' and '1' characters.
    """
    rows, cols = shape
    # cv2.resize takes the target size as (width, height) == (cols, rows).
    small = cv2.resize(img, (cols, rows))
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    # Take the mean over the real pixel count in floating point. A fixed
    # divisor of 100 is only right for 10x10 thumbnails, and summing uint8
    # scalars one by one can wrap around depending on NumPy's promotion rules.
    avg = gray.mean()
    return ''.join('1' if bit else '0' for bit in (gray > avg).ravel())

# 差值哈希算法(dHash)
def dHash(img, shape=(10, 10)):
    """Compute the difference hash (dHash) of a BGR image as a '0'/'1' string.

    The image is shrunk to ``rows x (cols + 1)`` pixels and converted to
    grayscale. Each pixel is compared with its right-hand neighbour: '1' when
    the left pixel is brighter, '0' otherwise, emitted in row-major order.

    Args:
        img: Colour image array accepted by ``cv2.resize`` and
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        shape: ``(rows, cols)`` of the comparison grid. The returned hash has
            ``rows * cols`` characters.

    Returns:
        The hash as a string of '0' and '1' characters.
    """
    rows, cols = shape
    # One extra column so every one of the `cols` positions has a right-hand
    # neighbour. cv2.resize takes (width, height), hence (cols + 1, rows);
    # this keeps non-square shapes from indexing out of bounds.
    small = cv2.resize(img, (cols + 1, rows))
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    brighter_than_next = gray[:, :-1] > gray[:, 1:]
    return ''.join('1' if bit else '0' for bit in brighter_than_next.ravel())


# 感知哈希算法(pHash)
def pHash(img, shape=(10, 10)):
    """Compute the perceptual hash (pHash) of a BGR image as a list of 0/1.

    The image is resized to 32x32, converted to grayscale and transformed
    with a DCT. The top-left ``shape`` block of coefficients is kept and each
    coefficient is encoded as 1 when it exceeds the block's mean, else 0.

    Args:
        img: Colour image array accepted by ``cv2.resize`` and
            ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        shape: ``(rows, cols)`` of the retained coefficient block. It was
            previously ignored in favour of a fixed 10x10 block; honouring it
            keeps the hash length in step with ``cmpHash(..., shape)``.
            Values above 32 are truncated by slicing to the 32x32 DCT.

    Returns:
        A flat list of ints (0 or 1) in row-major order.
    """
    small = cv2.resize(img, (32, 32))
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    # cv2.dct works on floating-point input.
    dct = cv2.dct(np.float32(gray))
    rows, cols = shape
    low_freq = dct[0:rows, 0:cols]
    average = np.mean(low_freq)
    return (low_freq > average).astype(int).ravel().tolist()


# 通过得到RGB每个通道的直方图来计算相似度
def classify_hist_with_split(image1, image2, size=(256, 256)):
    """Average the per-channel histogram similarity of two images.

    Both images are resized to ``size`` and split into channels. Channels are
    paired positionally, each pair is scored with ``calculate``, and the
    scores are averaged.

    Args:
        image1: First image array.
        image2: Second image array.
        size: ``(width, height)`` passed to ``cv2.resize`` for both images.

    Returns:
        The mean of the per-channel scores returned by ``calculate``.
    """
    image1 = cv2.resize(image1, size)
    image2 = cv2.resize(image2, size)
    # zip stops at the shorter channel list, so a BGR image compared with a
    # BGRA image is scored on the three channels they share.
    channel_pairs = list(zip(cv2.split(image1), cv2.split(image2)))
    total = sum(calculate(first, second) for first, second in channel_pairs)
    # Divide by the number of channels actually compared. A fixed divisor of
    # 3 inflates the score for 4-channel images and caps it at 1/3 for
    # single-channel ones.
    return total / len(channel_pairs)


# 计算单通道的直方图的相似值
def calculate(image1, image2):
    """Return the histogram overlap of two single-channel images.

    A 256-bin histogram is built for each image. Every bin contributes
    ``1 - |a - b| / max(a, b)`` (or 1 when the two counts are equal, which
    includes the case where both are 0), and the contributions are averaged.

    Args:
        image1: First single-channel image array.
        image2: Second single-channel image array.

    Returns:
        The overlap as a Python float between 0 and 1; 1.0 means the two
        histograms are identical.
    """
    # The upper bound of a calcHist range is exclusive, so [0, 256] is needed
    # for pixels with value 255 to be counted.
    hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256]).ravel()
    hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256]).ravel()

    larger = np.maximum(hist1, hist2)
    gap = np.abs(hist1 - hist2)
    # Start from a perfect score and lower only the bins that have data;
    # bins empty in both images stay at 1 and never divide by zero.
    overlap = np.ones(larger.shape, dtype=np.float64)
    populated = larger > 0
    overlap[populated] = 1.0 - gap[populated] / larger[populated]
    return float(overlap.mean())

# Hash值对比
def cmpHash(hash1, hash2, shape=(10, 10)):
    """Score how alike two hashes are.

    Args:
        hash1: First hash (any indexable sequence, e.g. str or list).
        hash2: Second hash, expected to have the same length as ``hash1``.
        shape: Hash grid dimensions; the match count is divided by
            ``shape[0] * shape[1]``.

    Returns:
        -1 when the two hashes differ in length, otherwise the number of
        positions holding equal values divided by ``shape[0] * shape[1]``.
    """
    # A length mismatch signals a caller error and is reported as -1.
    if len(hash1) != len(hash2):
        return -1
    matching = sum(1 for left, right in zip(hash1, hash2) if left == right)
    cells = shape[0] * shape[1]
    return matching / cells

def _read_image(file_path):
    """Decode an image file; return None for an empty or undecodable file."""
    # np.fromfile + cv2.imdecode is used instead of cv2.imread, matching the
    # way this script has always loaded images.
    data = np.fromfile(file_path, dtype=np.uint8)
    if data.size == 0:
        return None
    return cv2.imdecode(data, -1)


def remove_similarity_picture(path, s, threshold=0.95):
    """Delete near-duplicate images inside the folder ``os.path.join(path, s)``.

    Every file is compared with each file listed after it using
    ``classify_hist_with_split``; a later file whose score reaches
    ``threshold`` is deleted, so the first file of each group of look-alikes
    is kept.

    A candidate that cannot be decoded, or whose comparison raises, is also
    deleted (the traceback is printed when there is one). A reference image
    that cannot be decoded is skipped, so it never causes other files to be
    removed.

    Args:
        path: Base directory.
        s: Sub-directory (joined onto ``path``) holding the images.
        threshold: Minimum similarity at which the later file is removed.
    """
    folder = os.path.join(path, s)
    file = os.listdir(folder)
    print(file)

    for i, o in enumerate(file):
        reference = os.path.join(folder, o)
        # The file may already have been deleted as a duplicate of an earlier
        # one; sub-directories are not images either.
        if not os.path.isfile(reference):
            continue
        img1 = _read_image(reference)
        if img1 is None:
            continue

        for b in file[i + 1:]:
            candidate = os.path.join(folder, b)
            if not os.path.isfile(candidate):
                continue
            img2 = _read_image(candidate)
            print(o, b, "的相似度如下")

            n = None
            if img2 is not None:
                try:
                    n = classify_hist_with_split(img1, img2)
                except Exception:
                    print(traceback.format_exc())

            if n is None:
                # Unreadable or incomparable candidate: remove it and move on
                # without reusing a score left over from an earlier pair.
                os.remove(candidate)
                continue

            print('三直方图算法相似度：', n)
            if n >= threshold:
                print('相似')
                os.remove(candidate)



def main(path='D:/Project/test'):
    """Remove duplicate pictures from every folder named in LoadPicture.csv.

    Reads ``LoadPicture.csv`` from ``path``, takes the distinct values of its
    ``Parent SKU`` column as sub-folder names and runs
    ``remove_similarity_picture`` once per existing folder.

    Args:
        path: Directory containing the CSV file and the per-SKU folders.
    """
    # Read the SKU column as text so numeric-looking SKUs keep their exact
    # spelling and can be used as folder names.
    dt = pd.read_csv(os.path.join(path, "LoadPicture.csv"), header=0, sep=',',
                     dtype={'Parent SKU': str})

    # dict.fromkeys removes repeats while keeping first-seen order. Testing
    # the raw SKU against a list of joined paths never matches, which would
    # schedule the same folder once per CSV row.
    sku_list = list(dict.fromkeys(dt['Parent SKU'].dropna()))

    for s in sku_list:
        if not os.path.isdir(os.path.join(path, s)):
            print('skip missing folder:', os.path.join(path, s))
            continue
        # Pass the bare SKU; remove_similarity_picture joins it onto path.
        remove_similarity_picture(path, s)

if __name__=="__main__":
    main()

有时需要对大量文件或图片做批量处理，例如批量删除、批量修改文件名、移动到其他文件夹、检查图片是否损坏，或者批量裁剪。下面是完成这些操作的代码脚本。


import os
import shutil
import glob
from PIL import Image
import imghdr
import cv2

# 批量删除xml文件
def my_remove(base_path):
    """Delete every ``.xml`` file directly inside ``base_path``.

    Args:
        base_path: Directory to clean. Sub-directories are not descended
            into, and entries that are not regular files are left alone.
    """
    # List the directory that is actually being cleaned; names taken from a
    # different folder would be joined onto base_path and point at the wrong
    # (or non-existent) files.
    for file in os.listdir(base_path):
        if not file.endswith('.xml'):
            continue
        path = os.path.join(base_path, file)
        if not os.path.isfile(path):
            continue
        print(file)
        os.remove(path)

# 批量修改文件名
def my_filechange(folder='C:/Users/Yafex/Desktop/cut_file/clean',
                  start=900000, ext='.jpg'):
    """Rename every entry of ``folder`` to a sequential number plus ``ext``.

    Entries are processed in sorted name order, so the numbering is
    reproducible. The k-th entry (0-based) becomes ``str(start + k) + ext``.

    Args:
        folder: Directory whose entries are renamed.
        start: Number given to the first entry.
        ext: Suffix appended to every new name. Only the name changes; the
            file contents are not converted.

    Raises:
        FileExistsError: If a new name is already taken by another entry.
            Entries renamed before the clash keep their new names.
    """
    for offset, filename in enumerate(sorted(os.listdir(folder))):
        newname = str(start + offset) + ext

        print(filename)
        print(newname)
        if filename == newname:
            # Already carries its target name (e.g. on a second run).
            continue
        target = os.path.join(folder, newname)
        # os.rename silently replaces an existing file on POSIX systems;
        # refuse instead of destroying data.
        if os.path.exists(target):
            raise FileExistsError(target)
        os.rename(os.path.join(folder, filename), target)
# my_filechange()

# 批量移动文件
def my_move(srcfn, dstdir):
    """Move the file ``srcfn`` into the directory ``dstdir``.

    Args:
        srcfn: Path of the file to move. If it is not an existing regular
            file, 'srcfn error' is printed and nothing else happens.
        dstdir: Destination directory, created if missing. A trailing path
            separator is optional.
    """
    if not os.path.isfile(srcfn):
        print('srcfn error')
        return

    os.makedirs(dstdir, exist_ok=True)
    # os.path.join inserts the separator when dstdir does not end with one;
    # plain concatenation would glue the file name onto the directory name.
    dstfn = os.path.join(dstdir, os.path.basename(srcfn))
    shutil.move(srcfn, dstfn)

# 批量移动文件
def move_all(pattern='C:/Users/Yafex/Desktop/data/trainA/*.jpg',
             dstdir='C:/Users/Yafex/Desktop/data/valA/', every=5):
    """Move every ``every``-th numbered file matching ``pattern`` to ``dstdir``.

    A file is selected when the integer parsed from characters ``[-12:-6]``
    of its path is divisible by ``every``.

    Args:
        pattern: Glob pattern selecting the candidate files.
        dstdir: Directory the selected files are moved into via ``my_move``.
        every: Divisor applied to the number parsed from the path.

    Returns:
        The list of selected paths, in the order they were moved.

    Raises:
        ValueError: If characters ``[-12:-6]`` of a matched path are not an
            integer.
    """
    fns = glob.glob(pattern)

    # NOTE(review): the fixed slice assumes names such as '<6 digits>??.jpg'
    # where the 6 digits start 12 characters from the end -- confirm against
    # the real data set.
    src_test = [fn for fn in fns if int(fn[-12:-6]) % every == 0]
    print(src_test)

    for fn in src_test:
        my_move(fn, dstdir)
    return src_test

# Module-level call: it runs whenever this file is executed or imported.
# The calls to the other helpers in this script are left commented out.
move_all()


# 批量检查图片是否损坏
def check_picture():
    """Print the path of every file in the folder that imghdr cannot identify.

    Entries whose name ends in 'txt' are skipped. For each remaining entry
    ``imghdr.what`` is consulted; when it returns None the full path is
    printed, followed by that None result.
    """
    base = 'C:/Users/Yafex/Desktop/shuiyin'

    for entry in os.listdir(base):
        # Text files are not images, so they are not checked.
        if entry.endswith('txt'):
            continue
        file = os.path.join(base, entry)
        img = imghdr.what(file)
        if img is not None:
            continue
        print(file)
        print(img)
# check_picture()

# 批量裁剪图片
def make_bbox(path="D:/Project/test_cutting", path3="D:/Project/bboxcut"):
    """Crop every labelled box out of the PNG images found in ``path``.

    For each ``<name>.png`` that has a matching ``<name>.txt`` in ``path``,
    each line of the text file is read as one box given by a normalised
    centre and size. The box is converted to pixels, padded by 3 pixels on
    every side (staying inside the image) and written to
    ``<path3>/roi/<name>_<k>.jpg`` with ``k`` counting from 1.

    Args:
        path: Directory holding the images and their label files.
        path3: Output base directory; crops go into its ``roi`` sub-folder,
            which is created if missing.
    """
    out_dir = os.path.join(path3, "roi")
    # cv2.imwrite does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    img_total = []
    txt_total = set()
    for filename in os.listdir(path):
        first, last = os.path.splitext(filename)
        if last == ".png":
            img_total.append(first)
        elif last == ".txt":
            # Only real label files count; other extensions are ignored.
            txt_total.add(first)

    for img_ in img_total:
        if img_ not in txt_total:
            continue
        print("文件名：", img_)
        img = cv2.imread(os.path.join(path, img_ + ".png"))
        if img is None:
            # cv2.imread returns None for files it cannot read.
            continue
        h, w = img.shape[:2]

        n = 1
        label_path = os.path.join(path, img_ + ".txt")
        with open(label_path, "r", encoding="utf-8", errors="ignore") as f:
            for line in f:
                if not line.strip():
                    continue
                aa = line.split(" ")
                # NOTE(review): fields 2..5 are read as x_center, y_center,
                # width, height. Plain YOLO labels keep them in fields 1..4,
                # so confirm the layout of these label files.
                x_center, y_center = float(aa[2]), float(aa[3])
                width, height = float(aa[4]), float(aa[5])

                # Image rows grow downwards, so centre minus half the height
                # is the top edge.
                left = max(int(w * (x_center - width / 2.0)), 3)
                top = max(int(h * (y_center - height / 2.0)), 3)
                # Clamp the far edges themselves so the 3-pixel padding stays
                # inside the image.
                right = min(int(w * (x_center + width / 2.0)), w - 3)
                bottom = min(int(h * (y_center + height / 2.0)), h - 3)

                roi = img[top - 3:bottom + 3, left - 3:right + 3]
                if roi.size == 0:
                    # Degenerate box: nothing to write.
                    continue
                filename_last = img_ + "_" + str(n) + ".jpg"
                cv2.imwrite(os.path.join(out_dir, filename_last), roi)
                n = n + 1

# make_bbox()

def txt_match_jpg(
        jpg_dir='C:/Users/Yafex/Desktop/yolov5-master/datasets/watermark/images/train',
        txt_dir='C:/Users/Yafex/Desktop/yolov5-master/datasets/watermark/labels/train'):
    """Print the entries of an image folder next to those of a label folder.

    Both listings are sorted by name and paired positionally, so files that
    share a base name line up when the two folders hold matching sets.
    Pairing stops at the end of the shorter listing.

    Args:
        jpg_dir: Directory containing the images.
        txt_dir: Directory containing the label files.

    Returns:
        The list of ``(image_name, label_name)`` pairs that were printed.
    """
    # os.listdir returns names in arbitrary order; sort both sides so the
    # positional pairing is meaningful and reproducible.
    jpg_list = sorted(os.listdir(jpg_dir))
    txt_list = sorted(os.listdir(txt_dir))

    pairs = list(zip(jpg_list, txt_list))
    for j, t in pairs:
        print(j, t)
    return pairs

# txt_match_jpg()