Environment setup, model conversion and the like are covered in earlier posts, so here I go straight to the code.
First, the HRNet inference function, hrnet_inference.py:
import os
import urllib
import traceback
import time
import sys
import warnings
import numpy as np
import cv2
# RKNN_MODEL = "hrnet_w32_macaque_256x192-f7e9e04f_20230208.rknn"
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
QUANTIZE_ON = True
def bbox_xywh2cs(bbox, aspect_ratio, padding=1., pixel_std=200.):
"""Transform the bbox format from (x,y,w,h) into (center, scale)
Args:
bbox (ndarray): Single bbox in (x, y, w, h)
aspect_ratio (float): The expected bbox aspect ratio (w over h)
        padding (float): Bbox padding factor that will be multiplied to scale.
Default: 1.0
pixel_std (float): The scale normalization factor. Default: 200.0
Returns:
tuple: A tuple containing center and scale.
- np.ndarray[float32](2,): Center of the bbox (x, y).
- np.ndarray[float32](2,): Scale of the bbox w & h.
"""
x, y, w, h = bbox[:4]
center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array([w, h], dtype=np.float32) / pixel_std
scale = scale * padding
return center, scale
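# For example, bbox (0, 0, 100, 300) with aspect_ratio 0.75 and padding 1.25
# keeps h = 300, widens w to 0.75 * 300 = 225, and yields
# center = [50, 150], scale = [225, 300] / 200 * 1.25.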
def rotate_point(pt, angle_rad):
"""Rotate a point by an angle.
Args:
pt (list[float]): 2 dimensional point to be rotated
angle_rad (float): rotation angle by radian
Returns:
list[float]: Rotated point.
"""
assert len(pt) == 2
sn, cs = np.sin(angle_rad), np.cos(angle_rad)
new_x = pt[0] * cs - pt[1] * sn
new_y = pt[0] * sn + pt[1] * cs
rotated_pt = [new_x, new_y]
return rotated_pt
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(a) == 2
assert len(b) == 2
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
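# For example, a = (1, 0), b = (0, 0): direction = (1, 0), so the third point is
# b + (0, 1) = (0, 1), i.e. the direction vector rotated 90 degrees anticlockwise about b.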
def get_affine_transform(center,
scale,
rot,
output_size,
shift=(0., 0.),
inv=False):
"""Get the affine transform matrix, given the center/scale/rot/output_size.
Args:
center (np.ndarray[2, ]): Center of the bounding box (x, y).
scale (np.ndarray[2, ]): Scale of the bounding box
wrt [width, height].
rot (float): Rotation angle (degree).
output_size (np.ndarray[2, ] | list(2,)): Size of the
destination heatmaps.
shift (0-100%): Shift translation ratio wrt the width/height.
Default (0., 0.).
inv (bool): Option to inverse the affine transform direction.
(inv=False: src->dst or inv=True: dst->src)
Returns:
np.ndarray: The transform matrix.
"""
assert len(center) == 2
assert len(scale) == 2
assert len(output_size) == 2
assert len(shift) == 2
# pixel_std is 200.
scale_tmp = scale * 200.0
shift = np.array(shift)
src_w = scale_tmp[0]
dst_w = output_size[0]
dst_h = output_size[1]
rot_rad = np.pi * rot / 180
src_dir = rotate_point([0., src_w * -0.5], rot_rad)
dst_dir = np.array([0., dst_w * -0.5])
src = np.zeros((3, 2), dtype=np.float32)
src[0, :] = center + scale_tmp * shift
src[1, :] = center + src_dir + scale_tmp * shift
src[2, :] = _get_3rd_point(src[0, :], src[1, :])
dst = np.zeros((3, 2), dtype=np.float32)
dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
if inv:
trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
else:
trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
return trans
def bbox_xyxy2xywh(bbox_xyxy):
"""Transform the bbox format from x1y1x2y2 to xywh.
Args:
bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or
(n, 5). (left, top, right, bottom, [score])
Returns:
np.ndarray: Bounding boxes (with scores),
shaped (n, 4) or (n, 5). (left, top, width, height, [score])
"""
bbox_xywh = bbox_xyxy.copy()
bbox_xywh[:, 2] = bbox_xywh[:, 2] - bbox_xywh[:, 0]
bbox_xywh[:, 3] = bbox_xywh[:, 3] - bbox_xywh[:, 1]
return bbox_xywh
def _get_max_preds(heatmaps):
"""Get keypoint predictions from score maps.
Note:
batch_size: N
num_keypoints: K
heatmap height: H
heatmap width: W
Args:
heatmaps (np.ndarray[N, K, H, W]): model predicted heatmaps.
Returns:
tuple: A tuple containing aggregated results.
- preds (np.ndarray[N, K, 2]): Predicted keypoint location.
- maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints.
"""
assert isinstance(heatmaps,
np.ndarray), ('heatmaps should be numpy.ndarray')
assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
N, K, _, W = heatmaps.shape
heatmaps_reshaped = heatmaps.reshape((N, K, -1))
idx = np.argmax(heatmaps_reshaped, 2).reshape((N, K, 1))
maxvals = np.amax(heatmaps_reshaped, 2).reshape((N, K, 1))
preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
preds[:, :, 0] = preds[:, :, 0] % W
preds[:, :, 1] = preds[:, :, 1] // W
preds = np.where(np.tile(maxvals, (1, 1, 2)) > 0.0, preds, -1)
return preds, maxvals
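# The flattened argmax index decodes back to grid coordinates via the modulo and
# integer division above: with W = 72, index 145 gives x = 145 % 72 = 1, y = 145 // 72 = 2.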
def transform_preds(coords, center, scale, output_size, use_udp=False):
"""Get final keypoint predictions from heatmaps and apply scaling and
translation to map them back to the image.
Note:
num_keypoints: K
Args:
coords (np.ndarray[K, ndims]):
            * If ndims=2, coords are predicted keypoint locations.
            * If ndims=4, coords are composed of (x, y, scores, tags)
            * If ndims=5, coords are composed of (x, y, scores, tags,
              flipped_tags)
center (np.ndarray[2, ]): Center of the bounding box (x, y).
scale (np.ndarray[2, ]): Scale of the bounding box
wrt [width, height].
output_size (np.ndarray[2, ] | list(2,)): Size of the
destination heatmaps.
use_udp (bool): Use unbiased data processing
Returns:
np.ndarray: Predicted coordinates in the images.
"""
assert coords.shape[1] in (2, 4, 5)
assert len(center) == 2
assert len(scale) == 2
assert len(output_size) == 2
# Recover the scale which is normalized by a factor of 200.
scale = scale * 200.0
if use_udp:
scale_x = scale[0] / (output_size[0] - 1.0)
scale_y = scale[1] / (output_size[1] - 1.0)
else:
scale_x = scale[0] / output_size[0]
scale_y = scale[1] / output_size[1]
target_coords = np.ones_like(coords)
target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[0] * 0.5
target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[1] * 0.5
return target_coords
def keypoints_from_heatmaps(heatmaps,
center,
scale,
unbiased=False,
post_process='default',
kernel=11,
valid_radius_factor=0.0546875,
use_udp=False,
target_type='GaussianHeatmap'):
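    """Decode heatmaps into keypoint predictions in the original image space.

    A trimmed-down take on mmpose's keypoints_from_heatmaps: the unbiased,
    kernel, valid_radius_factor and target_type arguments are accepted for
    signature compatibility but not used by this simplified implementation.
    """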
    # Work on a copy so the caller's heatmaps are left untouched
    heatmaps = heatmaps.copy()
N, K, H, W = heatmaps.shape
preds, maxvals = _get_max_preds(heatmaps)
# add +/-0.25 shift to the predicted locations for higher acc.
for n in range(N):
for k in range(K):
heatmap = heatmaps[n][k]
px = int(preds[n][k][0])
py = int(preds[n][k][1])
if 1 < px < W - 1 and 1 < py < H - 1:
diff = np.array([
heatmap[py][px + 1] - heatmap[py][px - 1],
heatmap[py + 1][px] - heatmap[py - 1][px]
])
preds[n][k] += np.sign(diff) * .25
if post_process == 'megvii':
preds[n][k] += 0.5
# Transform back to the image
for i in range(N):
preds[i] = transform_preds(
preds[i], center[i], scale[i], [W, H], use_udp=use_udp)
if post_process == 'megvii':
maxvals = maxvals / 255.0 + 0.5
return preds, maxvals
def decode(output, center, scale, score_, batch_size=1):
c = np.zeros((batch_size, 2), dtype=np.float32)
s = np.zeros((batch_size, 2), dtype=np.float32)
score = np.ones(batch_size)
for i in range(batch_size):
c[i, :] = center
s[i, :] = scale
#score[i] = np.array(score_).reshape(-1)
score[i] = score_
preds, maxvals = keypoints_from_heatmaps(
output,
c,
s,
False,
'default',
11,
0.0546875,
False,
'GaussianHeatmap'
)
all_preds = np.zeros((batch_size, preds.shape[1], 3), dtype=np.float32)
all_boxes = np.zeros((batch_size, 6), dtype=np.float32)
all_preds[:, :, 0:2] = preds[:, :, 0:2]
all_preds[:, :, 2:3] = maxvals
all_boxes[:, 0:2] = c[:, 0:2]
all_boxes[:, 2:4] = s[:, 0:2]
all_boxes[:, 4] = np.prod(s * 200.0, axis=1)
all_boxes[:, 5] = score
result = {}
result['preds'] = all_preds
result['boxes'] = all_boxes
return result
def draw(bgr, predict_dict, skeleton):
bboxes = predict_dict["boxes"]
for box in bboxes:
cv2.rectangle(bgr, (int(box[0]), int(box[1])), (int(box[0]) + int(box[2]), int(box[1]) + int(box[3])),
(255, 0, 0))
all_preds = predict_dict["preds"]
for all_pred in all_preds:
for x, y, s in all_pred:
cv2.circle(bgr, (int(x), int(y)), 3, (0, 255, 120), -1)
for sk in skeleton:
x0 = int(all_pred[sk[0]][0])
y0 = int(all_pred[sk[0]][1])
x1 = int(all_pred[sk[1]][0])
y1 = int(all_pred[sk[1]][1])
cv2.line(bgr, (x0, y0), (x1, y1), (0, 255, 0), 1)
cv2.imwrite("result.jpg", bgr)
def myFunc00(rknn_lite, IMG, yolo_box):
if yolo_box is None:
return IMG
# bbox = [450, 150, 1100, 550, 0.99]
bbox = [int(yolo_box[0]), int(yolo_box[1]), int(yolo_box[2]), int(yolo_box[3]), 0.99]
# bbox = [1428, 723, 1421, 847, 0.99]
image_size = [384, 288]
# img = src_img
img = cv2.cvtColor(IMG, cv2.COLOR_BGR2RGB) # hwc rgb
aspect_ratio = image_size[0] / image_size[1]
img_height = img.shape[0]
img_width = img.shape[1]
padding = 1.25
pixel_std = 200
center, scale = bbox_xywh2cs(
bbox,
aspect_ratio,
padding,
pixel_std)
trans = get_affine_transform(center, scale, 0, image_size)
    img = cv2.warpAffine(  # warping pads with black borders; the output keypoints are mapped back through the same transform
        img,
        trans, (int(image_size[0]), int(image_size[1])),
        flags=cv2.INTER_LINEAR)
    # The two transposes below cancel out; they are kept only so the commented
    # normalization (presumably folded into the quantized RKNN model at
    # conversion time) can be re-enabled on the CHW layout if needed.
    img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # hwc rgb -> chw rgb
    # outputs = rknn.inference(inputs=[img], data_type=None, data_format="nchw")[0]
    # img[0, ...] = ((img[0, ...] / 255.0) - 0.485) / 0.229
    # img[1, ...] = ((img[1, ...] / 255.0) - 0.456) / 0.224
    # img[2, ...] = ((img[2, ...] / 255.0) - 0.406) / 0.225
    img = np.transpose(img, (1, 2, 0)).astype(np.float32)  # chw rgb -> back to hwc rgb
    # img = img.reshape(1, 256, 192, 3)
# Inference
print("--> Running model")
start = time.time()
img = np.expand_dims(img, axis=0)
outputs = rknn_lite.inference(inputs=[img])[0]
end = time.time()
    # Report the inference time
    runTime = end - start
    runTime_ms = runTime * 1000
    print("Inference time:", runTime_ms, "ms")
predict_dict = decode(outputs, center, scale, bbox[-1])
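    # COCO 17-keypoint skeleton as index pairs (0 nose, 1/2 eyes, 3/4 ears,
    # 5/6 shoulders, 7/8 elbows, 9/10 wrists, 11/12 hips, 13/14 knees, 15/16 ankles)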
skeleton = [[15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8], [7, 9],
[8, 10], [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]]
draw(IMG, predict_dict, skeleton)
return IMG
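Before wiring it into the pipeline, the decoding path can be sanity-checked on its own with a synthetic heatmap. This is a minimal sketch: the peak location, center and scale are made-up values, and it only assumes hrnet_inference.py is importable.
import numpy as np
from hrnet_inference import decode
# One fake (1, 17, 96, 72) heatmap batch with a single unit peak per keypoint.
heatmaps = np.zeros((1, 17, 96, 72), dtype=np.float32)
heatmaps[0, :, 48, 36] = 1.0  # peak at (x=36, y=48) on the heatmap grid
center = np.array([960.0, 540.0], dtype=np.float32)  # bbox center in image coordinates
scale = np.array([1.5, 2.0], dtype=np.float32)       # bbox (w, h) / 200
result = decode(heatmaps, center, scale, 0.99)
print(result['preds'].shape)  # (1, 17, 3): x, y, confidence
print(result['preds'][0][0])  # [960. 540.   1.]: the grid peak mapped back into the image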
Next, the YOLO detection function, yolo_inference.py:
from copy import copy
import time
import numpy as np
import cv2
from rknnlite.api import RKNNLite
# BOX = (450, 150, 1100, 550)
BOX = (170, 80, 740, 1360)
x, y, w, h = BOX
OBJ_THRESH = 0.25
NMS_THRESH = 0.45
IMG_SIZE = (320, 320)
CLASSES = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light',
'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard',
'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
'hair drier', 'toothbrush']
anchors = [[[10, 13], [16, 30], [33, 23]],
[[30, 61], [62, 45], [59, 119]],
[[116, 90], [156, 198], [373, 326]]]
class Letter_Box_Info():
def __init__(self, shape, new_shape, w_ratio, h_ratio, dw, dh, pad_color) -> None:
self.origin_shape = shape
self.new_shape = new_shape
self.w_ratio = w_ratio
self.h_ratio = h_ratio
self.dw = dw
self.dh = dh
self.pad_color = pad_color
def box_process(position, anchors):
grid_h, grid_w = position.shape[2:4]
    col, row = np.meshgrid(np.arange(0, grid_w), np.arange(0, grid_h))  # (40, 40) for a 320 input
    col = col.reshape(1, 1, grid_h, grid_w)  # (1, 1, 40, 40)
    row = row.reshape(1, 1, grid_h, grid_w)
    grid = np.concatenate((col, row), axis=1)  # (1, 2, 40, 40)
    stride = np.array([IMG_SIZE[1]//grid_h, IMG_SIZE[0]//grid_w]).reshape(1, 2, 1, 1)  # 8 for the first head
anchors = np.array(anchors)
anchors = anchors.reshape(*anchors.shape, 1, 1) # (3, 2, 1, 1)
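    # YOLOv5-style decode (this assumes the model's outputs are already
    # sigmoid-activated): xy = (2*p_xy - 0.5 + grid) * stride, wh = (2*p_wh)^2 * anchor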
box_xy = position[:,:2,:,:]*2 - 0.5
box_wh = pow(position[:,2:4,:,:]*2, 2) * anchors
box_xy += grid
box_xy *= stride
    box = np.concatenate((box_xy, box_wh), axis=1)  # (3, 4, 40, 40)
# Convert [c_x, c_y, w, h] to [x1, y1, x2, y2]
xyxy = np.copy(box)
xyxy[:, 0, :, :] = box[:, 0, :, :] - box[:, 2, :, :]/ 2 # top left x
xyxy[:, 1, :, :] = box[:, 1, :, :] - box[:, 3, :, :]/ 2 # top left y
xyxy[:, 2, :, :] = box[:, 0, :, :] + box[:, 2, :, :]/ 2 # bottom right x
xyxy[:, 3, :, :] = box[:, 1, :, :] + box[:, 3, :, :]/ 2 # bottom right y
return xyxy
def filter_boxes(boxes, box_confidences, box_class_probs):
    """Filter boxes whose combined score falls below OBJ_THRESH."""
    box_confidences = box_confidences.reshape(-1)
    class_max_score = np.max(box_class_probs, axis=-1)
    classes = np.argmax(box_class_probs, axis=-1)
    _class_pos = np.where(class_max_score * box_confidences >= OBJ_THRESH)
    scores = (class_max_score * box_confidences)[_class_pos]
    boxes = boxes[_class_pos]
    classes = classes[_class_pos]
    return boxes, classes, scores
def nms_boxes(boxes, scores):
"""Suppress non-maximal boxes.
# Returns
keep: ndarray, index of effective boxes.
"""
x = boxes[:, 0]
y = boxes[:, 1]
w = boxes[:, 2] - boxes[:, 0]
h = boxes[:, 3] - boxes[:, 1]
areas = w * h
order = scores.argsort()[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
xx1 = np.maximum(x[i], x[order[1:]])
yy1 = np.maximum(y[i], y[order[1:]])
xx2 = np.minimum(x[i] + w[i], x[order[1:]] + w[order[1:]])
yy2 = np.minimum(y[i] + h[i], y[order[1:]] + h[order[1:]])
w1 = np.maximum(0.0, xx2 - xx1 + 0.00001)
h1 = np.maximum(0.0, yy2 - yy1 + 0.00001)
inter = w1 * h1
ovr = inter / (areas[i] + areas[order[1:]] - inter)
inds = np.where(ovr <= NMS_THRESH)[0]
order = order[inds + 1]
keep = np.array(keep)
return keep
def post_process(input_data, anchors):
boxes, scores, classes_conf = [], [], []
    # 1*255*h*w -> 3*85*h*w
    input_data = [_in.reshape([len(anchors[0]), -1] + list(_in.shape[-2:])) for _in in input_data]
    for i in range(len(input_data)):  # (3, 85, 40, 40) for a 320 input
        boxes.append(box_process(input_data[i][:, :4, :, :], anchors[i]))  # (3, 4, 40, 40)
        scores.append(input_data[i][:, 4:5, :, :])  # (3, 1, 40, 40)
        classes_conf.append(input_data[i][:, 5:, :, :])  # (3, 80, 40, 40)
def sp_flatten(_in):
ch = _in.shape[1]
_in = _in.transpose(0,2,3,1)
return _in.reshape(-1, ch)
    boxes = [sp_flatten(_v) for _v in boxes]                 # per head: (N, 4)
    classes_conf = [sp_flatten(_v) for _v in classes_conf]   # per head: (N, 80)
    scores = [sp_flatten(_v) for _v in scores]               # per head: (N, 1)
    boxes = np.concatenate(boxes)                # (6300, 4) for a 320 input
    classes_conf = np.concatenate(classes_conf)  # (6300, 80)
    scores = np.concatenate(scores)              # (6300, 1)
# filter according to threshold
boxes, classes, scores = filter_boxes(boxes, scores, classes_conf)
    # e.g. boxes (12, 4), classes (12,), scores (12,) after thresholding
# nms
nboxes, nclasses, nscores = [], [], []
for c in set(classes):
inds = np.where(classes == c)
b = boxes[inds]
c = classes[inds]
s = scores[inds]
keep = nms_boxes(b, s)
if len(keep) != 0:
nboxes.append(b[keep])
nclasses.append(c[keep])
nscores.append(s[keep])
if not nclasses and not nscores:
return None, None, None
boxes = np.concatenate(nboxes)
classes = np.concatenate(nclasses)
scores = np.concatenate(nscores)
return boxes, classes, scores
def draw(image, boxes, scores, classes):
    for box, score, cl in zip(boxes, scores, classes):
        x1, y1, x2, y2 = [int(_b) for _b in box]  # boxes are in xyxy format
        print("%s @ (%d %d %d %d) %.3f" % (CLASSES[cl], x1, y1, x2, y2, score))
        area_left, area_top, area_w, area_h = BOX
        cv2.rectangle(image, (area_left, area_top), (area_left + area_w, area_top + area_h), (0, 0, 255), 2)
        cv2.rectangle(image, (x1, y1), (x2, y2), (255, 0, 0), 2)
        cv2.putText(image, '{0} {1:.2f}'.format(CLASSES[cl], score),
                    (x1, y1 - 6), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
def letterbox(im, new_shape=(640, 640), color=(0, 0, 0), letter_box_info_list=None):
    if letter_box_info_list is None:  # avoid Python's mutable-default-argument pitfall
        letter_box_info_list = []
    shape = im.shape[:2]  # current shape [height, width]
if isinstance(new_shape, int):
new_shape = (new_shape, new_shape)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
ratio = r # width, height ratios
new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding
# dw, dh = np.mod(dw, 32), np.mod(dh, 32)
dw /= 2 # divide padding into 2 sides
dh /= 2
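    # Example: a 1920x1080 frame letterboxed to (320, 320): r = 1/6, resize to
    # 320x180, leaving 140 px of vertical padding split as 70 px top and bottom.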
if shape[::-1] != new_unpad: # resize
im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR)
top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # add border
letter_box_info_list.append(Letter_Box_Info(shape, new_shape, ratio, ratio, dw, dh, color))
return im, letter_box_info_list
def get_real_box(box, in_format='xyxy', letter_box_info_list=[]):
bbox = copy(box)
# unletter_box result
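    # Inverse of letterbox: real = (letterboxed - pad) / ratio, then clip to the
    # original frame so boxes cannot fall outside the image.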
if in_format=='xyxy':
bbox[:,0] -= letter_box_info_list[-1].dw
bbox[:,0] /= letter_box_info_list[-1].w_ratio
bbox[:,0] = np.clip(bbox[:,0], 0, letter_box_info_list[-1].origin_shape[1])
bbox[:,1] -= letter_box_info_list[-1].dh
bbox[:,1] /= letter_box_info_list[-1].h_ratio
bbox[:,1] = np.clip(bbox[:,1], 0, letter_box_info_list[-1].origin_shape[0])
bbox[:,2] -= letter_box_info_list[-1].dw
bbox[:,2] /= letter_box_info_list[-1].w_ratio
bbox[:,2] = np.clip(bbox[:,2], 0, letter_box_info_list[-1].origin_shape[1])
bbox[:,3] -= letter_box_info_list[-1].dh
bbox[:,3] /= letter_box_info_list[-1].h_ratio
bbox[:,3] = np.clip(bbox[:,3], 0, letter_box_info_list[-1].origin_shape[0])
return bbox
def yolo_run(yolo_rknn, frame):
img0 = frame[y:y + h, x:x + w, :]
    img, letter_box_info_list = letterbox(im=img0.copy(), new_shape=(IMG_SIZE[1], IMG_SIZE[0]))  # padded resize
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # BGR to RGB (RKNNLite takes NHWC input, so no CHW transpose)
if len(img.shape) == 3:
img = img[None] # expand for batch dim
outputs = yolo_rknn.inference(inputs=[img]) # Inference
    boxes, classes, scores = post_process(outputs, anchors)
    if boxes is None:  # nothing passed the confidence threshold
        return None
    boxes_filter, scores_filter, classes_filter = None, [], []
    max_box = [0, 0, 0, 0]
for box, score, cl in zip(boxes, scores, classes):
if cl == 0:
if (box[2]-box[0])*(box[3]-box[1]) > (max_box[2]-max_box[0])*(max_box[3]-max_box[1]):
max_box = box
boxes_filter = np.expand_dims(max_box, axis=0)
scores_filter = np.expand_dims(score, axis=0)
classes_filter = np.expand_dims(cl, axis=0)
    # img_p = img0.copy()
    if boxes_filter is None:  # no person among the detections
        return None
    yolo_box = get_real_box(boxes_filter, 'xyxy', letter_box_info_list)
yolo_box[0][0] += x
yolo_box[0][1] += y
yolo_box[0][2] += x
yolo_box[0][3] += y
draw(frame, yolo_box, scores_filter, classes_filter)
# cv2.imwrite("11.jpg", frame)
return yolo_box[0]
Finally, the main script, inference.py:
import cv2
import time
from hrnet_inference import myFunc00
from rknnlite.api import RKNNLite
from yolo_inference import yolo_run
rknn_model = './models/rktest.rknn'
yolo_model = './models/yolotest.rknn'
yolo_rknn = RKNNLite()
print('--> Load YOLO RKNN model')
yolo_ret = yolo_rknn.load_rknn(yolo_model)
if yolo_ret != 0:
print('Load YOLO RKNN model failed')
exit(yolo_ret)
print('done')
yolo_ret = yolo_rknn.init_runtime()
if yolo_ret != 0:
print('Init runtime environment failed!')
exit(yolo_ret)
print('done')
hrnet_rknn = RKNNLite()
print('--> Load HRNet RKNN model')
hrnet_ret = hrnet_rknn.load_rknn(rknn_model)
if hrnet_ret != 0:
print('Load HRNet RKNN model failed')
exit(hrnet_ret)
print('done')
hrnet_ret = hrnet_rknn.init_runtime()
if hrnet_ret != 0:
print('Init runtime environment failed!')
exit(hrnet_ret)
print('done')
cap = cv2.VideoCapture('./input/000.mp4')
frames, loopTime, initTime = 0, time.time(), time.time()
pTime = 0
while (cap.isOpened()):
frames += 1
ret, frame = cap.read()
if not ret:
break
try:
yolo_box = yolo_run(yolo_rknn, frame)
    except Exception:  # skip frames where detection fails
        continue
frame = myFunc00(hrnet_rknn, frame, yolo_box)
cTime = time.time()
fps = 1 / (cTime - pTime)
pTime = cTime
cv2.putText(frame, str(int(fps)), (50, 50), cv2.FONT_HERSHEY_PLAIN, 3, (0, 255, 0), 3)
frame = cv2.resize(frame, (int(frame.shape[1] / 2), int(frame.shape[0] / 2)))
cv2.imshow('test', frame)
cv2.imwrite("11.jpg", frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
    if frames % 30 == 0:
        print("Average FPS over the last 30 frames:\t", 30 / (time.time() - loopTime))
        loopTime = time.time()
print("Overall average FPS:\t", frames / (time.time() - initTime))
# Release the capture and the RKNN runtimes
cap.release()
cv2.destroyAllWindows()
yolo_rknn.release()
hrnet_rknn.release()