文章目录
git地址:https://github.com/TMElyralab/MuseTalk
一、模型理解

1.1 训练方式
类似于stable diffusion的流程,原始人脸与mask唇部图片先经过训练好的Encoder转化为隐向量concat一起,经过Unet预测每一步生成的噪声,语音信息【./models/whisper/tiny.pt生成】作为一个条件机制加入到训练过程中。
1.2 Loss
$L_1$：预测的隐向量和真实图片的隐向量之间的距离
$L_2$：原图和生成图之间的差异
二、测试代码流程
2.1 main函数
总流程:
- 输入视频/图片、音频
- 对于音频,调用Audio2Feature类进行音频编码、分块(第i帧图片对应音频块[i-2,i+2])
- 如果输入的是视频,分帧、提取人脸关键点、提取人脸边界框(手动mask人脸下半部分)
- 模型生成每一帧图片
- 裁剪图片与背景合成每一帧
- FFmpeg 生成视频
def main(args):
global pe
if args.use_float16 is True:
pe = pe.half()
vae.vae = vae.vae.half()
unet.model = unet.model.half()
inference_config = OmegaConf.load(args.inference_config)
print(inference_config)
#{'task_0': {'video_path': './data/video/yongen.mp4', 'audio_path': './data/audio/yongen.wav'}, 'task_1': {'video_path': './data/video/sun.mp4', 'audio_path': './data/audio/sun.wav', 'bbox_shift': -7}}
for task_id in inference_config:
video_path = inference_config[task_id]["video_path"]
audio_path = inference_config[task_id]["audio_path"]
bbox_shift = inference_config[task_id].get("bbox_shift", args.bbox_shift)
input_basename = os.path.basename(video_path).split('.')[0]
audio_basename = os.path.basename(audio_path).split('.')[0]
output_basename = f"{
input_basename}_{
audio_basename}"
result_img_save_path = os.path.join(args.result_dir, output_basename) # related to video & audio inputs
crop_coord_save_path = os.path.join(result_img_save_path, input_basename+".pkl") # only related to video input
os.makedirs(result_img_save_path,exist_ok =True)
if args.output_vid_name is None:
#./results/yongen_yongen.mp4'
output_vid_name = os.path.join(args.result_dir, output_basename+".mp4")
else:
output_vid_name = os.path.join(args.result_dir, args.output_vid_name)
############################################## extract frames from source video ##############################################
#如果输入是一个视频,切帧
if get_file_type(video_path)=="video":
save_dir_full = os.path.join(args.result_dir, input_basename)
os.makedirs(save_dir_full,exist_ok = True)
#提取视频的每一帧,总帧数:帧率*时长s
cmd = f"ffmpeg -v fatal -i {
video_path} -start_number 0 {
save_dir_full}/%08d.png"
os.system(cmd)
input_img_list = sorted(glob.glob(os.path.join(save_dir_full, '*.[jpJP][pnPN]*[gG]')))
fps = get_video_fps(video_path)
elif get_file_type(video_path)=="image":
input_img_list = [video_path, ]
fps = args.fps
elif os.path.isdir(video_path): # input img folder
input_img_list = glob.glob(os.path.join(video_path, '*.[jpJP][pnPN]*[gG]'))
input_img_list = sorted(input_img_list, key=lambda x: int(os.path.splitext(os.path.basename(x))[0]))
fps = args.fps
else:
raise ValueError(f"{
video_path} should be a video file, an image file or a directory of images")
#print(input_img_list)
############################################## extract audio feature ##############################################
#调用Audio2Feature类进行音频编码
whisper_feature = audio_processor.audio2feat(audio_path)
#对音频分块
whisper_chunks = audio_processor.feature2chunks(feature_array=whisper_feature,fps=fps)
############################################## preprocess input image ##############################################
if os.path

最低0.47元/天 解锁文章
4416






