代码解析—part 2 数据集加载MFS—CVPR2023—Implicit Identity Leakage: The Stumbling Block to Improving Deepfake

本文链接：https://blog.youkuaiyun.com/JustWantToLearn/article/details/139092687

论文讲解请看：https://blog.youkuaiyun.com/JustWantToLearn/article/details/138758033
代码链接：https://github.com/megvii-research/CADDM
在这里，我们简要描述算法流程，着重分析模型搭建细节，以及为什么要这样搭建。
part 1：数据集准备，请看链接 https://blog.youkuaiyun.com/JustWantToLearn/article/details/138773005
part 2：数据集加载，包含 Multi-scale Facial Swap (MFS) 模块（本文）
part 3：训练过程，ADM模块 https://blog.youkuaiyun.com/JustWantToLearn/article/details/139116455

1、数据集加载

这里我在作者代码基础上做了一些修改，当landmark不存在时，跳过
最重要的函数：prepare_train_input

class DeepfakeDataset(Dataset):
    r"""Deepfake image dataset indexed by a landmark JSON file.

    The folder is expected to be organized as follows: root/cls/xxx.img_ext

    Labels are indices of sorted classes in the root directory.

    Args:
        mode: train or test.
        config: hyper parameters for processing images.
    """

    def __init__(self, mode: str, config: dict):
        super().__init__()

        self.config = config
        self.mode = mode
        self.root = self.config['dataset']['img_path']
        self.landmark_path = self.config['dataset']['ld_path']
        self.rng = np.random
        assert mode in ['train', 'test']
        self.do_train = mode == 'train'
        # Landmark/label metadata, keyed by image path without extension.
        self.info_meta_dict = self.load_landmark_json(self.landmark_path)
        # collect_class / collect_samples are methods of this class; the
        # article lists them in separate snippets.
        self.class_dict = self.collect_class()
        self.samples = self.collect_samples()

    def load_landmark_json(self, landmark_json) -> Dict:
        """Load and return the JSON document stored at ``landmark_json``."""
        with open(landmark_json, 'r') as f:
            landmark_dict = json.load(f)
        return landmark_dict

    def __getitem__(self, index: int) -> Tuple:
        """Load sample ``index`` and convert it to tensors.

        Returns:
            train mode: ``(img, (label, location_label, confidence_label))``,
                or ``(None, message)`` when ``prepare_train_input`` reports
                an error by returning a string instead of a label dict.
            test mode: ``(img, (label, video_name))``.

        Raises:
            ValueError: if ``self.mode`` is neither 'train' nor 'test'.
        """
        path, label_meta = self.samples[index]  # sample path and metadata
        ld = np.array(label_meta['landmark'])  # face landmarks of the sample
        label = label_meta['labels']  # class label of the sample
        source_path = label_meta['source_path']  # path of the source image
        img = cv2.imread(path, cv2.IMREAD_COLOR)
        source_img = cv2.imread(source_path, cv2.IMREAD_COLOR)
        if self.mode == "train":
            # Build the training image (with MFS) and its labels.
            img, label_dict = prepare_train_input(
                img, source_img, ld, label, self.config, self.do_train
            )
            if isinstance(label_dict, str):
                # A string in place of the label dict is an error message.
                return None, label_dict

            location_label = torch.Tensor(label_dict['location_label'])
            confidence_label = torch.Tensor(label_dict['confidence_label'])
            img = torch.Tensor(img.transpose(2, 0, 1))  # HWC -> CHW
            return img, (label, location_label, confidence_label)

        elif self.mode == 'test':
            # Align and crop only; no augmentation at test time.
            img, label_dict = prepare_test_input(
                [img], ld, label, self.config
            )
            img = torch.Tensor(img[0].transpose(2, 0, 1))  # HWC -> CHW
            video_name = label_meta['video_name']
            return img, (label, video_name)

        else:
            raise ValueError("Unsupported mode of dataset!")

    def __len__(self):
        """Return the number of collected samples."""
        return len(self.samples)

1.1 收集样本 collect_samples

流程：构建每个文件的完整路径 path，并从路径中提取 info_key 和 video_name。
使用 info_key 从 info_meta_dict 获取记录信息 info_meta，包括 landmark、class_label 和 source_path。将路径和标记信息元组添加到 samples 列表中。
如果出现异常（如文件没有对应的标记信息），则增加 none_nums 计数器并打印。

    def collect_samples(self) -> List:
        """Index every annotated image found under the class folders.

        Walks each class directory listed in ``self.class_dict`` below
        ``self.root`` and looks up each file's metadata in
        ``self.info_meta_dict`` under the file path with its last four
        characters removed.

        Files without a usable annotation are skipped; a running count of
        skipped files is printed each time one is encountered.

        Returns:
            A list of ``(path, meta)`` tuples where ``meta`` holds the keys
            'labels', 'landmark', 'source_path' and 'video_name'.
        """
        samples = []
        none_nums = 0
        directory = os.path.expanduser(self.root)
        for key in sorted(self.class_dict.keys()):
            d = os.path.join(directory, key)
            if not os.path.isdir(d):
                continue
            for r, _, filename in sorted(os.walk(d, followlinks=True)):
                for name in sorted(filename):
                    path = os.path.join(r, name)
                    # NOTE(review): assumes a dot plus three-character
                    # extension (e.g. ".png") -- confirm against the data.
                    info_key = path[:-4]
                    video_name = '/'.join(path.split('/')[:-1])
                    try:
                        info_meta = self.info_meta_dict[info_key]
                        landmark = info_meta['landmark']
                        class_label = int(info_meta['label'])
                        source_path = info_meta['source_path'] + path[-4:]
                    except (KeyError, TypeError, ValueError):
                        # Missing or malformed annotation: skip the file.
                        # Anything else (e.g. KeyboardInterrupt) propagates.
                        none_nums += 1
                        print(none_nums)
                        continue
                    samples.append(
                        (path, {'labels': class_label, 'landmark': landmark,
                                'source_path': source_path,
                                'video_name': video_name})
                    )
        return samples

1.2 收集类别 collect_class

    def collect_class(self) -> Dict:
        """Map every sub-directory name of ``self.root`` to a class index.

        Directory names are ordered in descending order; the first name
        gets index 0. Indices are stored as ``np.int32``.
        """
        names = sorted(
            (entry.name for entry in os.scandir(self.root) if entry.is_dir()),
            reverse=True,
        )
        return {name: np.int32(idx) for idx, name in enumerate(names)}

1.3 prepare_train_input

将 targetRgb 和 sourceRgb 图像存储在 images 列表中。
包含multi_scale_facial_swap、label_assign

def prepare_train_input(targetRgb, sourceRgb, landmark, label, config, training=True):
    '''Prepare model input images.

    Arguments:
    targetRgb: original images or fake images.
    sourceRgb: source images.
    landmark: face landmark.
    label: deepfake labels. genuine: 0, fake: 1.
    config: deepfake config dict.
    training: return processed image with aug or not.

    Returns:
    (image, label_dict) where label_dict has the keys 'label',
    'location_label' and 'confidence_label'. When the facial swap or the
    label assignment fails, (None, error_message) is returned instead.
    '''
    rng = np.random
    images = [targetRgb, sourceRgb]
    # In training mode, apply resize_aug when the random draw is >= 0.7.
    if training and rng.rand() >= 0.7:
        images, landmark = resize_aug(images, landmark)
    # multi-scale facial swap.

    targetRgb, sourceRgb = images
    # if input image is genuine.
    mfs_result, bbox = targetRgb, np.zeros((1, 4))
    # if input image is fake image. generate new fake image with mfs.
    if label:
        # Pick the blending type at random: 'poisson' or 'alpha'.
        blending_type = 'poisson' if rng.rand() >= 0.5 else 'alpha'

        if rng.rand() >= 0.2:
            # global facial swap (taken when the random draw is >= 0.2).
            sliding_win = targetRgb.shape[:2]

            if rng.rand() > 0.5:
                # fake to source global facial swap.
                mfs_result, bbox = multi_scale_facial_swap(
                    targetRgb, sourceRgb, landmark, config,
                    sliding_win, blending_type, training
                )
            elif rng.rand() >= 0.5:
                # source to fake global facial swap.
                mfs_result, bbox = multi_scale_facial_swap(
                    sourceRgb, targetRgb, landmark, config,
                    sliding_win, blending_type, training
                )
            else:
                # Keep the fake image as is; only align and crop it.
                mfs_result, bbox = targetRgb, np.array([[0, 0, 224, 224]])
                cropMfs, landmark = get_align5p(
                    [mfs_result], landmark, rng, config, training
                )
                mfs_result = cropMfs[0]
        else:
            # partial facial swap (random draw < 0.2): swap only inside a
            # sliding window picked at random from the configured priors.
            prior_bbox = config['sliding_win']['prior_bbox']
            sliding_win = prior_bbox[np.random.choice(len(prior_bbox))]
            mfs_result, bbox = multi_scale_facial_swap(
                sourceRgb, targetRgb, landmark, config,
                sliding_win, blending_type, training
            )
    else:
        # crop face with landmark (genuine image: align and crop only).
        cropMfs, landmark = get_align5p(
            [mfs_result], landmark, rng, config, training
        )
        mfs_result = cropMfs[0]

    if mfs_result is None:
        return None, 'multi scale facial swap err.'

    if training:  # and rng.rand() >= 0.5:
        # Training-time augmentation: horizontal mirror and noise.
        mfs_result, bbox = image_h_mirror(mfs_result, bbox)
        mfs_result = add_noise(rng, mfs_result)
    # The image is genuine when the label is falsy (0).
    genuine = not label
    # Compute the location and confidence labels from the normalised bbox.
    location_label, confidence_label = label_assign(
        bbox.astype('float32') / config['crop_face']['output_size'],
        config, genuine
    )
    if location_label is None:
        # label_assign reports failure as (None, message); forward it in
        # the same (None, message) form used for swap errors so the caller
        # does not try to build tensors from it.
        return None, confidence_label
    # Return the processed image together with all labels.
    return mfs_result, {'label': label, 'location_label': location_label,
                        'confidence_label': confidence_label}

1.3.1 label_assign

为训练数据分配标签

def label_assign(bboxs, config, genuine=False):
    '''Assign location and confidence labels to the prior boxes.

    Arguments:
    bboxs: ground-truth boxes, one row per box.
    config: config dict forwarded to get_prior.
    genuine: when True, all-zero labels are returned.

    Returns (location_label, confidence_label) as numpy arrays, or
    (None, error_message) when the matched offsets exceed 10**5.
    '''
    global Prior
    # presumably (re)builds the module-level Prior from config -- confirm
    get_prior(config)
    labels = torch.zeros(bboxs.shape[0],)
    defaults = Prior.forward().data  # information of priors
    num_priors = defaults.shape[0]

    # Genuine images have nothing to localise: return all-zero labels.
    if genuine:
        return np.zeros(defaults.shape), np.zeros(num_priors, )

    # anchor matching by iou; match() fills loc_t / conf_t in place at
    # batch index 0.
    loc_t = torch.zeros(1, num_priors, 4)
    conf_t = torch.zeros(1, num_priors)
    match(0.5, torch.Tensor(bboxs), defaults,
          [0.1, 0.2], labels, loc_t, conf_t, 0)

    location = np.array(loc_t)[0, ...]
    confidence = np.array(conf_t)[0, ...]

    if location.max() > 10**5:
        return None, 'prior bbox match err. bias is inf!'
    return location, confidence

1.4 prepare_test_input函数

对输入图像和面部标志点进行对齐和裁剪

def prepare_test_input(img, ld, label, config):
    '''Align and crop test images with the face landmarks.

    Uses the 'face_width', 'output_size' and 'scale' entries of
    config['crop_face'] and returns (aligned_images, {'label': label}).
    '''
    crop_cfg = config['crop_face']
    aligned, _ = align_5p(
        img,
        ld=ld,
        face_width=crop_cfg['face_width'],
        canvas_size=crop_cfg['output_size'],
        scale=crop_cfg['scale'],
    )
    return aligned, {'label': label}

2、MFS 函数核心

多尺度人脸交换

先进行全局或者局部人脸交换，接着生成全局或局部混合掩码
对伪造图像进行对齐和裁剪，并返回裁剪后的图像和边界框信息
global_facial_swap、partial_facial_swap、poisson_blending_func、alpha_blending_func、get_align5p、get_partial_bbox_gt

def multi_scale_facial_swap(
        srcRgb, targetRgb, landmark, config,
        sliding_win, blending_type='poisson', training=False):
    '''Multi-scale Facial Swap function.

    Argument.

    srcRgb: source image. type -> ndarray
    targetRgb: target image. type -> ndarray
    landmark: position of face landmarks.
    config: config dict forwarded to get_align5p.
    sliding_win: (height, width) of the swap window; a window equal to the
        target image shape selects the global swap.
    blending_type: image blending function. (poisson or alpha).
    training: forwarded to the mask generation and the alignment.

    Returns (cropped blended image, ndarray of boxes), or
    (None, 'partial swap err.') when no partial box can be found.
    '''
    assert min(sliding_win) > 0
    assert blending_type == 'poisson' or blending_type == 'alpha'

    # Bring the source image to the target's size if they differ.
    if srcRgb.shape != targetRgb.shape:
        target_h, target_w = targetRgb.shape[:2]
        srcRgb = cv2.resize(srcRgb, (target_w, target_h))
    assert srcRgb.shape == targetRgb.shape

    # generate blending mask.
    if sliding_win == targetRgb.shape[:2]:
        # global facial swap. (sliding window equals the image shape)
        swap_level = 'global'
        mask, blured, points = global_facial_swap(
            srcRgb, targetRgb, landmark, training)
    else:
        swap_level = 'partial'
        mask, blured, points = partial_facial_swap(
            srcRgb, targetRgb, landmark, sliding_win)

    # Blend; poisson blending falls back to alpha blending on any error.
    use_alpha = blending_type != 'poisson'
    if not use_alpha:
        try:
            mfs_fake = poisson_blending_func(srcRgb, targetRgb, mask, points)
        except Exception:
            use_alpha = True
    if use_alpha:
        mfs_fake = alpha_blending_func(srcRgb, targetRgb, blured/255.)

    # Align and crop the blended image together with the source image.
    aligned, landmark = get_align5p(
        [mfs_fake, srcRgb], landmark, np.random, config, training)
    cropMfs, cropSrc = aligned

    # The first box always spans the full 224x224 crop.
    boxes = [[0, 0, 224, 224]]
    if swap_level == 'partial':
        # For a partial swap, also locate the swapped region in the crop.
        partial_bbox = get_partial_bbox_gt(cropMfs, cropSrc, sliding_win)
        if partial_bbox is None:
            return None, 'partial swap err.'
        boxes.append(partial_bbox)

    return cropMfs, np.array(boxes)

2.1 全局人脸交换 global_facial_swap

创建一个与源图像相同大小的掩码
生成人脸标记点的凸包，并将凸包区域填充为白色
在训练模式下，有一定概率对掩码进行弹性变换
对掩码进行高斯模糊处理。
返回生成的掩码、模糊掩码和凸包点。

def global_facial_swap(srcRgb, targetRgb, landmark, training=False):
    '''Global Facial Swap.

    Builds a blending mask from the convex hull of the face landmarks.

    Argument:

    srcRgb: background image. type -> ndarray.
    targetRgb: target image. type -> ndarray.
    landmark: position of face landmarks.
    training: when True, the mask is elastically deformed with
        probability 0.5.

    Returns (mask, blurred mask as float32, convex hull points).
    '''
    # Convex hull of the landmarks.
    hull = cv2.convexHull(np.array(landmark).astype('float32'))

    # Zero mask shaped like srcRgb with the hull polygon filled with 255.
    mask = np.zeros(srcRgb.shape, dtype=np.uint8)
    polygon = np.expand_dims(hull, axis=0).astype(np.int32)
    cv2.fillPoly(mask, polygon, (255,)*3)

    # The random draw only happens in training mode.
    if training and np.random.rand() > 0.5:
        # NOTE(review): the arguments look like (image, alpha, sigma, seed)
        # -- confirm against elastic_transform.
        mask = elastic_transform(
            mask, random.randint(300, 500), mask.shape[0] * 0.08, 0
        )

    # gaussianblur: 5x5 kernel, sigma 3.
    blured = cv2.GaussianBlur(mask, (5, 5), 3).astype('float32')
    return mask, blured, hull

2.2 部分人脸交换

生成全局交换掩码。
调用 generate_partial_swap_mask 函数生成部分交换掩码和滑动窗口边界。
对部分交换掩码进行高斯模糊处理。
返回部分交换掩码、模糊后的掩码和滑动窗口边界。
包含函数类：generate_partial_swap_mask

def partial_facial_swap(srcRgb, targetRgb, landmark, sliding_win):
    '''Partial Facial Swap.

    Argument:

    srcRgb: background image. type -> ndarray.
    targetRgb: target image. type -> ndarray.
    landmark: position of face landmarks.
    sliding_win: size of sliding window.

    Returns (window mask, blurred window mask as float32, window bounds).
    '''
    # Only the un-blurred global face mask is needed here.
    face_mask, _, _ = global_facial_swap(srcRgb, targetRgb, landmark)
    # Restrict the swap to one sliding window.
    window_mask, window = generate_partial_swap_mask(
        targetRgb, srcRgb, face_mask, landmark, sliding_win
    )
    # Soften the window mask (5x5 kernel, sigma 3).
    soft_mask = cv2.GaussianBlur(window_mask, (5, 5), 3).astype('float32')
    return window_mask, soft_mask, window

2.2.1 generate_partial_swap_mask

包含函数类：cut_face、cal_dssim、_sliding_bbox
切割输入图像和交换掩码。
计算图像之间的 DSSIM 以确定交换区域。
生成滑动窗口的边界和掩码。

def generate_partial_swap_mask(
        targetRgb, srcRgb, global_swap_mask,
        landmark, sliding_win):
    '''Build a rectangular swap mask around the most dissimilar window.

    Crops both images and the global mask around the face, computes the
    DSSIM map of the two crops, and marks the sliding window with the
    largest DSSIM sum.

    Returns (mask shaped like srcRgb with the window set to 255, window).
    '''
    win_height, win_width = sliding_win

    # Crop the two images and the global mask around the landmarks.
    (first_crop, second_crop, mask_crop), crop_bbox = cut_face(
        [targetRgb, srcRgb, global_swap_mask], landmark
    )
    x0, y0, _, _ = crop_bbox

    dssim = cal_dssim(first_crop, second_crop)
    # Ignore differences outside the global swap mask.
    dssim[mask_crop < 1e-4] = 0

    # Scale the window from the fixed 224 reference size to the crop size.
    h_ratio, w_ratio = np.array(dssim.shape[:2]) / 224
    scaled_h = int(win_height * h_ratio)
    scaled_w = int(win_width * w_ratio)
    bbox = _sliding_bbox(dssim, scaled_h, scaled_w)
    top, left, bottom, right = bbox

    # NOTE(review): (x0, y0) is the unexpanded landmark bbox origin, while
    # cut_face crops from an expanded origin -- confirm the intended offset.
    bbox_mask = np.zeros(srcRgb.shape)
    bbox_mask[y0+top:y0+bottom, x0+left:x0+right, :] = 255
    return bbox_mask, bbox

2.2.1.1 cut face

根据面部特征点的位置裁剪图像。

首先计算特征点的最小和最大边界
根据这些边界扩展裁剪区域，并将结果应用于输入的每个图像
扩展的比例由参数 alpha 控制，裁剪后的图像保存在 cut_images 列表中，同时返回初始裁剪边界 crop_bbox

def cut_face(images, landmark, alpha=1.2):
    '''Crop every image around the landmark bounding box.

    Arguments:
    images: list of images to crop with the same window.
    landmark: ndarray of (x, y) points.
    alpha: factor by which the bounding box is enlarged before cropping.

    Returns (list of crops, unexpanded bbox as (xmin, ymin, xmax, ymax)).
    '''
    xs, ys = landmark[:, 0], landmark[:, 1]
    crop_bbox = (int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max()))

    # The crop geometry is the same for every image.
    left, top, right, bottom = crop_bbox
    span_x, span_y = right - left, bottom - top
    width = int(span_x * alpha)
    height = int(span_y * alpha)
    x_start = int(left - span_x*(alpha - 1)//2)
    y_start = int(top - span_y*(alpha - 1)//2)

    # Clamp the window to each image's bounds.
    cut_images = [
        img[
            max(y_start, 0): min(y_start + height, img.shape[0]),
            max(x_start, 0): min(x_start + width, img.shape[1])
        ]
        for img in images
    ]
    return cut_images, crop_bbox

2.2.1.2 计算DSSIM

计算cutSrc, cutTarget图像之间的 DSSIM

def cal_dssim(img1, img2):
    '''Get dssim between the image1 and image2.

    Argument:

    img1: input image -> ndarray.
    img2: input image ->ndarray.

    return dssim: ndarray shape like img1 and img2.
    '''
    C1 = (0.01 * 255)**2
    C2 = (0.03 * 255)**2

    # Work in float64.
    first = img1.astype(np.float64)
    second = img2.astype(np.float64)

    # 11-tap Gaussian (sigma 1.5) expanded to a 2-D window.
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    def smooth(image):
        # Local weighted mean under the Gaussian window.
        return cv2.filter2D(image, -1, window)

    mu1 = smooth(first)
    mu2 = smooth(second)
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2

    # Local variances and covariance.
    sigma1_sq = smooth(first**2) - mu1_sq
    sigma2_sq = smooth(second**2) - mu2_sq
    sigma12 = smooth(first * second) - mu1_mu2

    numerator = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    denominator = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = numerator / denominator

    # DSSIM = |(1 - SSIM) / 2|.
    dssim = abs((np.ones(ssim_map.shape) - ssim_map) / 2)
    # Give a 2-D map three identical channels.
    if dssim.ndim == 2:
        dssim = np.stack((dssim, )*3, -1)
    return dssim

2.2.1.3 滑动窗口

通过滑动窗口在给定的 DSSIM 掩码中找到 DSSIM 值最大的区域，并返回该区域的边界框

def _sliding_bbox(mask, h, w):
    '''sliding window.
    select a window size -> [h, w], sliding and find max dssim area.

    Argument:

    mask: dssim mask.
    h: sliding window hight.
    w: sliding window width.
    '''
    #step：滑动窗口的步长，设为掩码高度和宽度的五分之一
    #max_area：记录当前找到的最大 DSSIM 值区域的 DSSIM 总和，初始化为 0
    #res：记录 DSSIM 最大区域的边界框坐标，初始化为 [0, 0, 0, 0]。
    step = [mask.shape[0]//5, mask.shape[1]//5]
    max_area = 0
    res = [0] * 4
    #外层循环：i 从 0 开始，每次增加 step[0]，直到 mask 的高度
    #内层循环：j 从 0 开始，每次增加 step[1]，直到 mask 的宽度。
    for i in range(0, mask.shape[0], step[0]):
        for j in range(0, mask.shape[1], step[1]):
            if i + h <= mask.shape[0] and j + w <= mask.shape[1]:
            #计算窗口 [i:i+h, j:j+w] 内的 DSSIM 值总和
                area = np.sum(mask[i:i+h, j:j+w])
                if area > max_area:
                    max_area = area
                    res = [i, j, i+h, j+w]
    #返回找到的 DSSIM 最大区域的边界框坐标 res
    return res

2.3 混合方法:alpha_blending_func和poisson_blending_func

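alpha混合: $I'_F = I_F \cdot M + I_S \cdot (1 - M)$，其中 $I'_F$ 为混合后的图像，$M$ 为归一化到 $[0, 1]$ 的模糊掩码（代码中为 blured/255），$I_F$、$I_S$ 分别对应代码中的 targetRgb 与 srcRgb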
Poisson 混合，使用无缝克隆算法将targetRgb融合到srcRgb

def alpha_blending_func(srcRgb, targetRgb, mask):
    '''Alpha blending function

    Computes mask * targetRgb + (1 - mask) * srcRgb and casts to uint8.

    Argument:

    srcRgb: background image. type -> ndarray.
    targetRgb: target image. type -> ndarray.
    mask: blending mask. type -> ndarray.
    '''
    foreground = mask * targetRgb
    background = (1 - mask) * srcRgb
    blended = foreground + background
    return blended.astype(np.uint8)


def poisson_blending_func(srcRgb, targetRgb, mask, points):
    '''Poisson blending function

    Clones targetRgb into srcRgb with cv2.seamlessClone (NORMAL_CLONE).

    Argument:

    srcRgb: background image. type -> ndarray.
    targetRgb: target image. type -> ndarray.
    mask: blending mask. type -> ndarray.
    points: blending position. type -> ndarray.
    '''
    pts = np.array(points).reshape((-1, 2))
    first_col, second_col = pts[:, 0], pts[:, 1]

    # Midpoint of the extent along each column of the point list.
    center_x = (max(first_col) + min(first_col)) / 2
    center_y = (max(second_col) + min(second_col)) / 2

    # NOTE(review): the center is passed as (center_y, center_x), while
    # OpenCV documents the point as (x, y) -- confirm the intended order
    # for both the convex-hull and the sliding-window callers.
    center = (int(center_y), int(center_x))
    return cv2.seamlessClone(targetRgb, srcRgb, mask, center, cv2.NORMAL_CLONE)

2.4 get_align5p

对齐图像

def get_align5p(images, ld, rng, config, training=False):
    '''Align and crop images, with random jitter in training mode.

    Arguments:
    images: list of images to align (e.g. [mfs_fake, srcRgb]).
    ld: face landmarks.
    rng: random source providing rand() / randn().
    config: config dict; its 'crop_face' entry is used.
    training: when True, scale, translation, rotation and stretch are
        randomised; otherwise config['crop_face']['scale'] and neutral
        values are used.

    Returns (aligned images, warped landmarks).
    '''
    crop_cfg = config['crop_face']

    if training:
        # Draws happen in this fixed order: scale, translation x/y,
        # rotation, then the two stretch factors.
        scale = rng.randn()*0.1+0.9
        translation = [rand_range(rng, -25, 25), rand_range(rng, -25, 25)]
        rotation = 30*rand_range(rng, -1, 1)**3
        # Each stretch factor is only perturbed when its draw exceeds 0.8.
        sa = rand_range(rng, .97, 1.03) if rng.rand() > 0.8 else 1
        sb = rand_range(rng, .97, 1.03) if rng.rand() > 0.8 else 1
    else:
        scale = crop_cfg['scale']
        translation = [0, 0]
        rotation = 0
        sa = 1
        sb = 1

    images, landmark = align_5p(
        images, ld=ld,
        face_width=crop_cfg['face_width'],
        canvas_size=crop_cfg['output_size'],
        scale=scale,
        translation=translation,
        rotation=rotation,
        sa=sa,
        sb=sb,
    )
    return images, landmark

2.4.1 align_5p

根据面部标志点对图像进行对齐和裁剪

def align_5p(
        images, ld, face_width, canvas_size,
        translation=(0, 0), rotation=0,
        scale=1, sa=1, sb=1
):
    '''crop face with landmark.

    images: input images. -> list(ndarray).
    ld: face landmarks of the input images; points 30, 36-47, 48 and 54
        are read, so at least 55 (x, y) points are required.
    face_width: face width ratio of the cropped images. -> float
    canvas_size: side length of the square cropped face.
    translation: (dx, dy) added to the target points.
    rotation: rotation angle passed to cv2.getRotationMatrix2D.
    scale: factor applied to MEAN_FACE.
    sa, sb: stretch factors applied to the target points.
    return (list of cropped images, warped landmarks with 2 columns).
    '''
    # Reduce the landmarks to five key points.
    nose_tip = ld[30]
    left_eye = np.mean(ld[36:42], axis=0).astype('int')
    right_eye = np.mean(ld[42:48], axis=0).astype('int')
    left_mouth, right_mouth = ld[48], ld[54]

    lm = np.array([left_eye, right_eye, nose_tip, left_mouth, right_mouth])
    # MEAN_FACE is the predefined mean-face template; scaling creates a new
    # array, so the in-place update in get_mean_face leaves it untouched
    # (assuming MEAN_FACE is an ndarray -- TODO confirm).
    mf = MEAN_FACE * scale
    mf = get_mean_face(mf, face_width, canvas_size)

    # M1: transform from the detected key points to the mean face.
    M1 = np.eye(3)
    M1[:2] = get_align_transform(lm, mf)

    # M2: rotation about the canvas centre.
    M2 = np.eye(3)
    M2[:2] = cv2.getRotationMatrix2D((canvas_size/2, canvas_size/2), rotation, 1)

    def stretch(va, vb, s):
        # Scale the distance between va and vb by s around their midpoint,
        # writing the result back in place.
        m = (va+vb)*0.5
        d = (va-vb)*0.5
        va[:] = m+d*s
        vb[:] = m-d*s

    # Eyes and mouth corners only (the nose tip is dropped): four points
    # are what cv2.getPerspectiveTransform takes.
    mf = mf[[0, 1, 3, 4]].astype(np.float32)
    mf2 = mf.copy()
    stretch(mf2[0], mf2[1], sa)
    stretch(mf2[2], mf2[3], 1.0/sa)
    stretch(mf2[0], mf2[2], sb)
    stretch(mf2[1], mf2[3], 1.0/sb)

    mf2 += np.array(translation)

    # M3: perspective transform from the template to its jittered copy.
    M3 = cv2.getPerspectiveTransform(mf, mf2)

    M = M3.dot(M2).dot(M1)

    dshape = (canvas_size, canvas_size)
    images = [cv2.warpPerspective(img, M, dshape) for img in images]

    # warp landmark.
    ld = np.array(ld)
    ld = ld.dot(M[:, :2].T) + M[:, 2].T

    return images, ld[:, :2]

2.4.1.1 get_mean_face

def get_mean_face(mf, face_width, canvas_size):
    '''Rescale the mean-face points to canvas coordinates, in place.

    mf: ndarray of (x, y) points; it is modified and returned.
    face_width: face width used for the ratio.
    canvas_size: side length of the output canvas.
    '''
    # 0.34967 is the constant the face width is normalised with.
    ratio = face_width / (canvas_size * 0.34967)
    # Vertical normaliser, taken from the first point's y before any update.
    y_norm = (mf[0][1] * ratio + 0.5) * (1 + 1.42)

    xs = mf[:, 0]  # views into mf: assignments below write through
    ys = mf[:, 1]
    xs[:] = (xs * ratio + 0.5) * canvas_size
    ys[:] = (ys * ratio + 0.5) * canvas_size / y_norm
    return mf

2.4.1.2 get_align_transform

def get_align_transform(lm, mf):
    '''Compute the 2x3 transform that maps the points lm onto mf.

    The matrix has the form [[a, b, tx], [-b, a, ty]], with a and b
    obtained from the centred point sets.
    '''
    # Centroids of the template (mf) and of the detected points (lm).
    mf_cx, mf_cy = mf[:, 0].mean(), mf[:, 1].mean()
    lm_cx, lm_cy = lm[:, 0].mean(), lm[:, 1].mean()

    # Offsets of every point from its centroid.
    ux, uy = mf[:, 0] - mf_cx, mf[:, 1] - mf_cy
    dux, duy = lm[:, 0] - lm_cx, lm[:, 1] - lm_cy

    dot = (ux * dux + uy * duy).sum()
    cross = (ux * duy - uy * dux).sum()
    energy = (dux**2 + duy**2).sum()

    a = dot / energy
    b = cross / energy

    # Translations chosen so that the lm centroid lands on the mf centroid.
    tx = mf_cx - a * lm_cx - b * lm_cy
    ty = mf_cy - a * lm_cy + b * lm_cx

    transform = np.zeros((2, 3))
    transform[0] = (a, b, tx)
    transform[1] = (-b, a, ty)
    return transform

2.5 get_partial_bbox_gt

计算裁剪后的目标图像和源图像之间的 DSSIM 值。
如果 DSSIM 值大于某个小阈值的像素数超过一个预定值，则在 DSSIM 图上使用滑动窗口查找最大差异区域，并返回该区域的边界框。
如果未找到合适的区域，则返回 None。

def get_partial_bbox_gt(cutMfs, cutSrc, sliding_win):
    '''Locate the swapped region between two aligned crops.

    Returns the sliding-window box with the largest DSSIM sum, or None
    when no more than 40**2*3 DSSIM entries exceed 1e-3.
    '''
    dssim = cal_dssim(cutMfs, cutSrc)
    changed = (dssim > 1e-3).sum()
    if changed <= 40**2*3:
        # Too little visible difference to place a box.
        return None
    return _sliding_bbox(dssim, sliding_win[0], sliding_win[1])