For a walkthrough of the paper, see: https://blog.youkuaiyun.com/JustWantToLearn/article/details/138758033
Code: https://github.com/megvii-research/CADDM
Here we briefly describe the algorithm pipeline, focusing on how the model is built and why it is built that way.
Part 1: dataset preparation (this article)
Part 2: dataset loading, including the Multi-scale Facial Swap (MFS) module: https://blog.youkuaiyun.com/JustWantToLearn/article/details/139092687
Part 3: training process and the ADM module: https://blog.youkuaiyun.com/JustWantToLearn/article/details/139116455
Environment Setup
- Linux
- Python 3 >= 3.6
- PyTorch >= 1.6.0
- OpenCV >= 4.4.0
- SciPy >= 1.4.1
- NumPy >= 1.19.5
Installing the dlib library took me a long time: install cmake first, then install dlib. If that still fails, build from source:
git clone https://github.com/davisking/dlib.git
cd dlib
mkdir build
cd build
cmake ..
cmake --build . --config Release
cmake --build . --config Release --target install
cd ..
python setup.py install
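After installation, a quick sanity check (my own sketch, not part of the repo) confirms that dlib imports and can load the 81-point predictor used below; the shape_predictor_81_face_landmarks.dat file is commonly distributed via https://github.com/codeniko/shape_predictor_81_face_landmarks:
# sanity_check_dlib.py -- my own sketch, not part of CADDM
import dlib

print(dlib.__version__)  # should print the installed version without errors
detector = dlib.get_frontal_face_detector()  # HOG-based face detector
# path matches PREDICTOR_PATH used in extract_frames_ldm_ff++.py below
predictor = dlib.shape_predictor("./CADDM/lib/shape_predictor_81_face_landmarks.dat")
print("dlib is ready")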
Data Preparation: Extract Landmarks (ldm) from the FF++ Dataset
1. Download the FF++ dataset
Download it following the instructions at https://github.com/ondyari/FaceForensics/tree/master
and place it in the following directory structure:
.
└── data
    └── FaceForensics++
        ├── original_sequences
        │   └── youtube
        │       └── raw
        │           └── videos
        │               └── *.mp4
        └── manipulated_sequences
            ├── Deepfakes
            │   └── raw
            │       └── videos
            │           └── *.mp4
            ├── Face2Face
            │   ...
            ├── FaceSwap
            │   ...
            ├── NeuralTextures
            │   ...
            └── FaceShifter
                ...
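Before running the extraction, it is worth verifying that the layout matches; here is a minimal check (my own sketch, assuming the ./data root above):
# check_layout.py -- my own sketch for verifying the directory layout
from glob import glob

root = "./data/FaceForensics++"
n = len(glob(f"{root}/original_sequences/youtube/raw/videos/*.mp4"))
print(f"Original: {n} videos")  # FF++ ships 1000 original videos
for m in ["Deepfakes", "Face2Face", "FaceSwap", "NeuralTextures", "FaceShifter"]:
    n = len(glob(f"{root}/manipulated_sequences/{m}/raw/videos/*.mp4"))
    print(f"{m}: {n} videos")   # each manipulation method also has 1000 videos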
2. Extract landmarks
python lib/extract_frames_ldm_ff++.py
2.1 Define paths and helper functions
#!/usr/bin/env python3
# extract_frames_ldm_ff++.py
from glob import glob
import os
import cv2
from tqdm import tqdm
import numpy as np
import dlib
import json
import argparse
from imutils import face_utils
# define paths
VIDEO_PATH = "./CADDM/data/FaceForensics++"
SAVE_IMGS_PATH = "./test_images"
# pre-downloaded shape predictor model (81 facial landmarks)
PREDICTOR_PATH = "./CADDM/lib/shape_predictor_81_face_landmarks.dat"
DATASETS = {'Original', 'FaceSwap', 'FaceShifter', 'Face2Face', 'Deepfakes', 'NeuralTextures'}
COMPRESSION = {'raw'}
NUM_FRAMES = 1
IMG_META_DICT = dict()
def parse_labels(video_path):
label = None
if "original" in video_path:
label = 0
else:
label = 1
return label
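# A quick illustration (my own, not in the repo): "original" appears only in
# original_sequences paths, so those get label 0 and everything else gets 1, e.g.
#   parse_labels(".../original_sequences/youtube/raw/videos/000.mp4")          -> 0
#   parse_labels(".../manipulated_sequences/Deepfakes/raw/videos/000_003.mp4") -> 1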
def parse_source_save_path(save_path):
source_save_path = None
if "original" in save_path:
source_save_path = save_path
else:
img_meta = save_path.split('/')
source_target_index = img_meta[-1]
source_index = source_target_index.split('_')[0]
manipulation_name = img_meta[-4]
original_name = "youtube"
source_save_path = save_path.replace(
"manipulated_sequences", "original_sequences"
).replace(
manipulation_name, original_name
).replace(
source_target_index, source_index
)
return source_save_path
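A quick worked example (my own illustration) shows how the three replace calls map a fake video's save path back to its source video:
# './test_images/manipulated_sequences/FaceSwap/raw/frames/110_043'
# split('/') gives [..., 'FaceSwap', 'raw', 'frames', '110_043'], so
#   source_target_index = '110_043', source_index = '110',
#   manipulation_name = 'FaceSwap' (img_meta[-4])
print(parse_source_save_path(
    "./test_images/manipulated_sequences/FaceSwap/raw/frames/110_043"))
# -> ./test_images/original_sequences/youtube/raw/frames/110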
2.2 The main function
def main():
    # initialize dlib's face detector (HOG-based)
face_detector = dlib.get_frontal_face_detector()
    # load the shape predictor model
face_predictor = dlib.shape_predictor(PREDICTOR_PATH)
    # iterate over the datasets: {'Original', 'FaceSwap', 'FaceShifter', 'Face2Face', 'Deepfakes', 'NeuralTextures'}
for dataset in DATASETS:
for comp in COMPRESSION:
            # collect all mp4 files under this dataset/compression setting
movies_path_list = parse_video_path(dataset, comp)
n_sample = len(movies_path_list)
for i in tqdm(range(n_sample)):
                # use replace to build the frame save path from the video path
save_path_per_video = movies_path_list[i].replace(
VIDEO_PATH, SAVE_IMGS_PATH
).replace('.mp4', '').replace("/videos", "/frames")
preprocess_video(
movies_path_list[i], save_path_per_video,
face_detector, face_predictor
)
with open(f"{SAVE_IMGS_PATH}/ldm.json", 'w') as f:
json.dump(IMG_META_DICT, f)
if __name__ == '__main__':
main()
# vim: ts=4 sw=4 sts=4 expandtab
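To make the path rewriting concrete, here is what save_path_per_video looks like for one input (my own illustration; the file name 000_003.mp4 is hypothetical):
p = "./CADDM/data/FaceForensics++/manipulated_sequences/Deepfakes/raw/videos/000_003.mp4"
save = p.replace(VIDEO_PATH, SAVE_IMGS_PATH).replace('.mp4', '').replace("/videos", "/frames")
print(save)  # ./test_images/manipulated_sequences/Deepfakes/raw/frames/000_003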
2.3 parse_video_path: collect all videos
def parse_video_path(dataset, compression):
# this path setting follows FF++ dataset
if dataset == 'Original':
dataset_path = f'{VIDEO_PATH}/original_sequences/youtube/{compression}/videos/'
elif dataset in ['FaceShifter', 'Face2Face', 'Deepfakes', 'FaceSwap', 'NeuralTextures']:
dataset_path = f'{VIDEO_PATH}/manipulated_sequences/{dataset}/{compression}/videos/'
else:
raise NotImplementedError
# get all videos under the specific manipulated/original sequences
    # use glob to find all video files ending in .mp4
movies_path_list = sorted(glob(dataset_path+'*.mp4'))
    print("{} videos exist in {}".format(len(movies_path_list), dataset))
return movies_path_list
2.4 preprocess_video: detect faces and save landmarks
def preprocess_video(video_path, save_path, face_detector, face_predictor):
# save the video meta info here
video_dict = dict()
    # get the label: real videos get 0, fake videos get 1
    label = parse_labels(video_path)
    # get the source video's save path; for a fake video id like 110_043 the source id is 110,
    # e.g. './manipulated_sequences/FaceSwap/raw/frames/110_043' -> './original_sequences/youtube/raw/frames/110';
    # original videos map to themselves
    source_save_path = parse_source_save_path(save_path)
# prepare the save path
os.makedirs(save_path, exist_ok=True)
    # read the video and prepare the sampled frame indices
    # open the video file with OpenCV's VideoCapture
    cap_video = cv2.VideoCapture(video_path)
    # total number of frames in the video
    frame_count_video = int(cap_video.get(cv2.CAP_PROP_FRAME_COUNT))
    # use numpy.linspace to generate evenly spaced frame indices to extract;
    # with NUM_FRAMES = 1 this is just array([0]), i.e. only the first frame of each video is saved
    frame_idxs = np.linspace(0, frame_count_video - 1, NUM_FRAMES, endpoint=True, dtype=int)
    # process each frame
    # iterate over every frame of the video
    for cnt_frame in range(frame_count_video):
        # cap_video.read() reads the current frame, returning the read status ret and the frame data
        ret, frame = cap_video.read()
        if not ret:
            # check ret before touching frame: frame is None when the read fails
            tqdm.write('Frame read {} Error! : {}'.format(cnt_frame, os.path.basename(video_path)))
            continue
        height, width = frame.shape[:-1]
        # skip frames whose index is not in the sampled frame_idxs
        if cnt_frame not in frame_idxs:
            continue
        # convert the frame from BGR to RGB for dlib's face detector
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # detect faces with dlib; the second argument 1 upsamples the image once to find smaller faces
        faces = face_detector(frame, 1)
        if len(faces) == 0:
            tqdm.write('No faces in {}:{}'.format(cnt_frame, os.path.basename(video_path)))
            continue
        # one list for the facial landmarks, one for the size of each detected face
        landmarks = list()  # save the landmarks
        size_list = list()  # save the size of each detected face
for face_idx in range(len(faces)):
landmark = face_predictor(frame, faces[face_idx])
landmark = face_utils.shape_to_np(landmark)
x0, y0 = landmark[:, 0].min(), landmark[:, 1].min()
x1, y1 = landmark[:, 0].max(), landmark[:, 1].max()
face_s = (x1 - x0) * (y1 - y0)
size_list.append(face_s)
landmarks.append(landmark)
        # save the landmarks of the biggest face:
        # if multiple faces appear in one frame, keep the one with the largest bounding area;
        # concatenate all landmark arrays into one NumPy array, sort by face size (descending), take the first
        landmarks = np.concatenate(landmarks).reshape((len(size_list),) + landmark.shape)
        landmarks = landmarks[np.argsort(np.array(size_list))[::-1]][0]
        # save the meta info of this frame:
        # the largest face's landmarks, the source frame path, and the label
video_dict['landmark'] = landmarks.tolist()
video_dict['source_path'] = f"{source_save_path}/frame_{cnt_frame}"
video_dict['label'] = label
IMG_META_DICT[f"{save_path}/frame_{cnt_frame}"] = video_dict
        # save one frame:
        # convert the frame back from RGB to BGR and build the image save path
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
image_path = f"{save_path}/frame_{cnt_frame}.png"
cv2.imwrite(image_path, frame)
cap_video.release()
return
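Once extraction finishes, the saved metadata can be inspected like this (my own sketch; the keys follow the code above):
import json

with open("./test_images/ldm.json") as f:
    meta = json.load(f)
key = next(iter(meta))
print(key)                         # e.g. ./test_images/.../raw/frames/<video id>/frame_0
print(meta[key]["label"])          # 0 for real, 1 for fake
print(len(meta[key]["landmark"]))  # 81 landmark points
print(meta[key]["source_path"])    # path of the corresponding original frame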
3. Results
The first frame of each video is saved as a PNG, and ldm.json stores, for each saved frame, the 81 facial landmarks of the first frame's face, the path of the corresponding original frame, and the label.
3.1 Many frames fail landmark extraction
For quite a few videos, the face detector finds no face in the sampled frame, so no landmarks are extracted. This loses a fair amount of data, and it is unclear whether the authors applied any additional processing here.
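One possible mitigation (my own suggestion, not something the CADDM code does) is to retry detection with a larger upsampling factor before giving up:
def face_detect_with_retry(frame, face_detector, max_upsample=2):
    # dlib's second argument is the number of times to upsample the image;
    # higher values can find smaller faces at the cost of slower detection
    for upsample in range(1, max_upsample + 1):
        faces = face_detector(frame, upsample)
        if len(faces) > 0:
            return faces
    return []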