import argparse
import os
import time

import cv2
import numpy as np
import torch

from hamer.configs import CACHE_DIR_HAMER
from hamer.models import download_models, load_hamer, DEFAULT_CHECKPOINT
from hamer.utils import recursive_to
from hamer.datasets.vitdet_dataset import ViTDetDataset
from hamer.utils.renderer import Renderer, cam_crop_to_full
from hamer.utils.renderer import Renderer, cam_crop_to_full
from vitpose_model import ViTPoseModel

# Base mesh color for rendered hands (normalized RGB in [0, 1])
LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
def setup_models():
"""设置和加载所有需要的模型"""
# Download and load checkpoints
download_models(CACHE_DIR_HAMER)
model, model_cfg = load_hamer(DEFAULT_CHECKPOINT)
# Setup HaMeR model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
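    # Prefer the GPU when available; HaMeR inference is much slower on CPU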
model = model.to(device)
model.eval()
    # Load the person detector (Detectron2 Mask R-CNN from the model zoo)
    from hamer.utils.utils_detectron2 import DefaultPredictor_Lazy
    from detectron2 import model_zoo
detectron2_cfg = model_zoo.get_config('new_baselines/mask_rcnn_regnety_4gf_dds_FPN_400ep_LSJ.py', trained=True)
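    # Keep only confident person detections and tighten NMS to cut duplicate boxes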
detectron2_cfg.model.roi_heads.box_predictor.test_score_thresh = 0.5
detectron2_cfg.model.roi_heads.box_predictor.test_nms_thresh = 0.4
detector = DefaultPredictor_Lazy(detectron2_cfg)
# Keypoint detector
cpm = ViTPoseModel(device)
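    # ViTPose supplies whole-body 2D keypoints; its hand keypoints are used in
    # process_frame() to localize hand boxes for HaMeR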
# Setup the renderer
renderer = Renderer(model_cfg, faces=model.mano.faces)
return model, model_cfg, detector, cpm, renderer, device
def process_frame(img_cv2, model, model_cfg, detector, cpm, renderer, device, prev_results=None):
"""处理单帧图像"""
# 清理GPU内存
if torch.cuda.is_available():
torch.cuda.empty_cache()
    # Downscale the image to speed up processing
height, width = img_cv2.shape[:2]
max_dimension = 320
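    # 320 px on the long side favors speed; a larger cap improves detection at a cost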
if max(height, width) > max_dimension:
scale = max_dimension / max(height, width)
new_width = int(width * scale)
new_height = int(height * scale)
img_cv2 = cv2.resize(img_cv2, (new_width, new_height))
    img = img_cv2.copy()[:, :, ::-1]  # BGR -> RGB for the pose model
    # Detect people in the frame
det_out = detector(img_cv2)
det_instances = det_out['instances']
    valid_idx = (det_instances.pred_classes == 0) & (det_instances.scores > 0.3)  # class 0 = "person" (COCO)
pred_bboxes = det_instances.pred_boxes.tensor[valid_idx].cpu().numpy()
pred_scores = det_instances.scores[valid_idx].cpu().numpy()
if len(pred_bboxes) == 0:
        # No person detected: fall back to the previous frame's hand results
if prev_results and 'hand_verts' in prev_results:
return render_hands_on_frame(img_cv2, prev_results['hand_verts'], prev_results['hand_cam_t'],
prev_results['scaled_focal_length'], renderer, prev_results['boxes'],
prev_results['is_right']), prev_results
return img_cv2, None
    # Detect 2D keypoints with ViTPose; boxes are passed as (x1, y1, x2, y2, score)
vitposes_out = cpm.predict_pose(
img,
[np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
)
bboxes = []
is_right = []
    # Detect hand boxes from the 2D keypoints. In the ViTPose whole-body
    # layout, the last 42 keypoints are the hands: left first, then right
    # (21 keypoints each).
    for vitposes in vitposes_out:
        left_hand_keyp = vitposes['keypoints'][-42:-21]
        right_hand_keyp = vitposes['keypoints'][-21:]
        # Left hand
        keyp = left_hand_keyp
        valid = keyp[:, 2] > 0.3
        if sum(valid) > 2:
            bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(), keyp[valid, 0].max(), keyp[valid, 1].max()]
            bboxes.append(bbox)
            is_right.append(0)
        # Right hand
        keyp = right_hand_keyp
        valid = keyp[:, 2] > 0.3
        if sum(valid) > 2:
            bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(), keyp[valid, 0].max(), keyp[valid, 1].max()]
            bboxes.append(bbox)
            is_right.append(1)
if len(bboxes) == 0:
        # No hands detected: fall back to the previous frame's hand results
if prev_results and 'hand_verts' in prev_results:
return render_hands_on_frame(img_cv2, prev_results['hand_verts'], prev_results['hand_cam_t'],
prev_results['scaled_focal_length'], renderer, prev_results['boxes'],
prev_results['is_right']), prev_results
return img_cv2, None
boxes = np.stack(bboxes)
right = np.stack(is_right)
    # Reconstruct 3D hand meshes with HaMeR
dataset = ViTDetDataset(model_cfg, img_cv2, boxes, right, rescale_factor=1)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
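    # rescale_factor=1 keeps the tight keypoint-derived boxes (larger values
    # enlarge the crop around the hand); batch_size=1 keeps GPU memory low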
all_verts = []
all_cam_t = []
scaled_focal_length = None
for batch in dataloader:
batch = recursive_to(batch, device)
with torch.no_grad():
out = model(batch)
        # Convert the predicted camera parameters to full-image coordinates
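        # HaMeR predicts right hands only; left-hand crops are mirrored, so the
        # horizontal offset pred_cam[:, 1] must be flipped back for left hands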
multiplier = (2 * batch['right'] - 1)
pred_cam = out['pred_cam']
pred_cam[:, 1] = multiplier * pred_cam[:, 1]
box_center = batch["box_center"].float()
box_size = batch["box_size"].float()
img_size = batch["img_size"].float()
scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
pred_cam_t_full = cam_crop_to_full(pred_cam, box_center, box_size, img_size, scaled_focal_length).detach().cpu().numpy()
        # Collect vertices and per-hand camera translations
batch_size = batch['img'].shape[0]
for n in range(batch_size):
verts = out['pred_vertices'][n].detach().cpu().numpy()
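            # Mirror the x-axis for left hands: HaMeR outputs right-hand meshes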
is_right_flag = batch['right'][n].cpu().numpy()
verts[:, 0] = (2 * is_right_flag - 1) * verts[:, 0]
cam_t = pred_cam_t_full[n]
all_verts.append(verts)
all_cam_t.append(cam_t)
    # Render the hands onto the frame
if all_verts:
rendered_frame = render_hands_on_frame(img_cv2, all_verts, all_cam_t, scaled_focal_length, renderer, boxes, right)
        # Cache results so skipped or failed frames can reuse them
current_results = {
'hand_verts': all_verts,
'hand_cam_t': all_cam_t,
'scaled_focal_length': scaled_focal_length,
'boxes': boxes,
'is_right': right
}
return rendered_frame, current_results
else:
return img_cv2, None
def render_hands_on_frame(img_cv2, all_verts, all_cam_t, scaled_focal_length, renderer, boxes=None, is_right=None):
"""将手部渲染到帧上"""
rendered_frame = img_cv2.copy()
for i, (verts, cam_t) in enumerate(zip(all_verts, all_cam_t)):
try:
            # Render this hand as a float RGBA image. Passing render_res (an
            # assumed parameter of HaMeR's Renderer.render_rgba, which otherwise
            # renders at its default resolution) keeps the overlay at frame size.
            render_img = renderer.render_rgba(
                verts,
                cam_t,
                mesh_base_color=LIGHT_BLUE,
                focal_length=scaled_focal_length,
                render_res=(rendered_frame.shape[1], rendered_frame.shape[0]),
            )
            # Resize the rendering to match the frame, in case resolutions differ
if render_img.shape[:2] != rendered_frame.shape[:2]:
render_img = cv2.resize(render_img, (rendered_frame.shape[1], rendered_frame.shape[0]))
            # Alpha-blend the rendering onto the frame. render_rgba returns float
            # RGBA in [0, 1] with RGB channel order, so flip to BGR and rescale
            # to [0, 255] before compositing onto the BGR frame.
            overlay = render_img[:, :, :3][:, :, ::-1]
            alpha = render_img[:, :, 3:4]
            rendered_frame = (rendered_frame * (1 - alpha) + overlay * 255 * alpha).astype(np.uint8)
        except Exception as e:
            print(f"Rendering failed: {e}")
            # If rendering fails, at least draw the bounding box
if boxes is not None and i < len(boxes):
color = (0, 255, 0) if (is_right is not None and i < len(is_right) and is_right[i]) else (255, 0, 0)
x1, y1, x2, y2 = boxes[i]
cv2.rectangle(rendered_frame, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
hand_type = "Right" if (is_right is not None and i < len(is_right) and is_right[i]) else "Left"
cv2.putText(rendered_frame, f'{hand_type} Hand', (int(x1), int(y1)-10),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
return rendered_frame
def main():
    parser = argparse.ArgumentParser(description='HaMeR real-time webcam demo')
    parser.add_argument('--camera_id', type=int, default=0, help='Camera ID (default: 0)')
    parser.add_argument('--frame_skip', type=int, default=3, help='Process every Nth frame to improve speed (default: 3)')
    parser.add_argument('--show_fps', action='store_true', default=False, help='Show FPS overlay')
    parser.add_argument('--use_cpu', action='store_true', default=False, help='Force CPU mode')
args = parser.parse_args()
    # Select the device. Note that setup_models() re-derives the device, so
    # hiding CUDA via CUDA_VISIBLE_DEVICES is what actually forces CPU mode.
    if args.use_cpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        device = torch.device('cpu')
        print("Using CPU mode")
    else:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {device}")
print("正在加载模型...")
try:
model, model_cfg, detector, cpm, renderer, device = setup_models()
print("模型加载完成!")
except Exception as e:
print(f"模型加载失败: {e}")
return
    # Open the camera, falling back to other IDs if the requested one fails
    cap = cv2.VideoCapture(args.camera_id)
    if not cap.isOpened():
        print("Could not open camera")
        for i in range(1, 5):
            cap = cv2.VideoCapture(i)
            if cap.isOpened():
                print(f"Found camera ID: {i}")
                break
        else:
            print("No usable camera found")
            return
    # Request a small capture resolution to reduce processing load
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 320)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 240)
    # Create the display window
    cv2.namedWindow('HaMeR - Real-Time Hand Reconstruction', cv2.WINDOW_NORMAL)
    cv2.resizeWindow('HaMeR - Real-Time Hand Reconstruction', 320, 240)
frame_count = 0
fps_time = time.time()
fps = 0.0
    prev_results = None  # Cached results from the last processed frame
    last_processed_frame = None  # Last rendered output frame
    print("Starting real-time hand reconstruction...")
    print("Press 'q' to quit")
    print("Press 's' to save the current frame")
while True:
ret, frame = cap.read()
if not ret:
print("无法读取摄像头帧")
break
frame_count += 1
        # Process this frame fully, or reuse cached results on skipped frames
        if frame_count % args.frame_skip == 0:
            try:
                # Run detection and reconstruction on this frame
processed_frame, current_results = process_frame(
frame.copy(), model, model_cfg, detector, cpm, renderer, device, prev_results
)
                # Cache results for the skipped frames in between
if current_results is not None:
prev_results = current_results
last_processed_frame = processed_frame
hand_count = len(current_results['hand_verts']) if current_results and 'hand_verts' in current_results else 0
                # FPS is measured over processed frames only
current_time = time.time()
fps = 1.0 / (current_time - fps_time)
fps_time = current_time
            except Exception as e:
                print(f"Error while processing frame: {e}")
                # On failure, reuse the last successfully rendered frame
if last_processed_frame is not None:
processed_frame = last_processed_frame
else:
processed_frame = frame
hand_count = 0
        else:
            # Skipped frame: re-render the cached hand results onto the new frame
if prev_results is not None and 'hand_verts' in prev_results:
processed_frame = render_hands_on_frame(
frame.copy(),
prev_results['hand_verts'],
prev_results['hand_cam_t'],
prev_results['scaled_focal_length'],
renderer,
prev_results.get('boxes'),
prev_results.get('is_right')
)
last_processed_frame = processed_frame
hand_count = len(prev_results['hand_verts'])
else:
processed_frame = frame
hand_count = 0
        # Overlay FPS and status info
if args.show_fps:
cv2.putText(processed_frame, f'FPS: {fps:.1f}', (10, 30),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
cv2.putText(processed_frame, f'Hands: {hand_count}', (10, 60),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
status = "Processing" if frame_count % args.frame_skip == 0 else "Reusing"
cv2.putText(processed_frame, f'Status: {status}', (10, 90),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        # Show the result
        try:
            cv2.imshow('HaMeR - Real-Time Hand Reconstruction', processed_frame)
        except Exception as e:
            print(f"Error displaying frame: {e}")
break
        # Keyboard controls ('q' quits, 's' saves a frame)
key = cv2.waitKey(1) & 0xFF
if key == ord('q'):
break
elif key == ord('s'):
timestamp = int(time.time())
cv2.imwrite(f'saved_frame_{timestamp}.jpg', processed_frame)
print(f"帧已保存: saved_frame_{timestamp}.jpg")
    # Release resources
cap.release()
cv2.destroyAllWindows()
print("程序结束")
if __name__ == '__main__':
    main()
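
# Example invocation (the script filename is assumed; use your actual filename):
#   python hamer_webcam_demo.py --camera_id 0 --frame_skip 3 --show_fps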