Deploying YOLO classification + detection + segmentation with Ascend ACL

This project deploys the YOLOv8 family on the Ascend 310B using the ACL (AscendCL) interfaces provided by Huawei CANN. It implements classification, detection, and segmentation, and shows how to call each model.
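For reference, a plausible way to produce the .om files loaded below (an assumed workflow for this setup; file names and soc_version are placeholders): export the trained checkpoint to ONNX with Ultralytics, then convert it with CANN's ATC tool.

# Assumed export step (requires the ultralytics package):
#   from ultralytics import YOLO
#   YOLO("best.pt").export(format="onnx", imgsz=640)
# Assumed ATC conversion (run inside a CANN environment):
#   atc --model=best.onnx --framework=5 --output=best \
#       --input_shape="images:1,3,640,640" --soc_version=Ascend310B1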

import acl
import numpy as np
import colorsys
import copy
from PIL import Image
import cv2
import time
import os
# from utils import cvtColor, preprocess_input, resize_image

ROOT = os.getcwd()
#************************* Classification **************************
class ACL_Yolov8_cls(object):
    def __init__(self, config):
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single
              
        # Model input parameters
        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.classes = config["classes"]

    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # For each input, allocate device memory and add it to the aclmdlDataset.
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            # dtype must match the om model's output (use np.float16 for an FP16 output).
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()   # tostring() is deprecated in NumPy; tobytes() is equivalent
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # Copy the image data from host to device.
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()


    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)
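    # A worked numeric example of the letterbox math above (assumed 1920x1080 BGR input, 640x640 model):
    #   r = min(640/1080, 640/1920) = 0.3333 -> new_unpad = (640, 360)
    #   pad_w = (640 - 640) / 2 = 0, pad_h = (640 - 360) / 2 = 140
    #   so 140 rows of gray (114) padding go on top and bottom, and
    #   preprocessing() returns a (1, 3, 640, 640) float32 tensor.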

    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)
        print(f'image preprocess time cost:{time.time() - time_}')

        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()[0]
        print(f'data copy to host time cost:{time.time() - time2}')

        # Post-processing: map each class name to its confidence column.
        result_dic = {}
        for index in range(len(self.classes)):
            classname = self.classes[index]
            result_dic[classname] = preds[:, index]

        return result_dic    # output: dict {classname: fp32 confidence}, from a (1, num_classes) tensor
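# Note: depending on how the classification model was exported, the .om output may be
# raw logits rather than probabilities. A minimal sketch (an assumption, not part of
# the original pipeline) to normalize preds before building result_dic in infer():
def softmax_np(x, axis=-1):
    """Numerically stable softmax over the given axis."""
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)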

#************************* Detection **************************
class ACL_Yolov8_det(object):
    def __init__(self, config):
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single

        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.conf_threshold= config["conf_thres"]
        self.iou_threshold = config["iou_thres"]
        self.classes = config["classes"]
    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # print("input_size",input_size)
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            print("buffer_size",buffer_size)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # print("self.input_data[0]",self.input_data[0]["buffer"])
        # print("self.input_data[0]",self.input_data[0]["size"])
        # 将图片数据从Host传输到Device。
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()

    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)

    def postprocess_v8(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold):
        """
        Post-process the prediction.

        Args:
            preds (Numpy.ndarray): predictions come from ort.session.run().
            im0 (Numpy.ndarray): [h, w, c] original input image.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
            conf_threshold (float): conf threshold.
            iou_threshold (float): iou threshold.

        Returns:
            boxes (List): list of bounding boxes.
        """
        color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3))

        x = preds  # outputs: predictions (1, 84, 8400)
        # Transpose the first output: (Batch_size, xywh_conf_cls, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls)
        x = np.einsum('bcn->bnc', x)  # (1, 8400, 84)

        # Predictions filtering by conf-threshold
        x = x[np.amax(x[..., 4:], axis=-1) > conf_threshold]

        # Create a new matrix which merge these(box, score, cls) into one
        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
        x = np.c_[x[..., :4], np.amax(x[..., 4:], axis=-1), np.argmax(x[..., 4:], axis=-1)]

        # NMS filtering
        # After NMS: np.array([[x, y, w, h, conf, cls], ...]), shape = (-1, 4 + 1 + 1)
        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]

        rois = []
        class_ids = []
        scores = []

        # Rescale bounding boxes back to the original image for drawing
        if len(x) > 0:
            # Bounding boxes format change: cxcywh -> xyxy
            x[..., [0, 1]] -= x[..., [2, 3]] / 2
            x[..., [2, 3]] += x[..., [0, 1]]

            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
            x[..., :4] /= min(ratio)

            # Bounding boxes boundary clamp
            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])

            boxes= x[..., :6]

            # Extract ROIs, confidence scores and class IDs
            rois = boxes[:, :4].astype(int).tolist()
            scores = boxes[:, 4].tolist()
            class_ids = boxes[:, 5].astype(int).tolist()

            # Draw rectangles
            for (*box, conf, cls_) in boxes:
                cv2.rectangle(im0, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
                              color_palette[int(cls_)], 2, cv2.LINE_AA)
                cv2.putText(im0, f'{self.classes[int(cls_)]}: {conf:.3f}', (int(box[0]), int(box[1] - 9)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2, cv2.LINE_AA)

            return rois, im0, class_ids
        else:
            print("No bounding boxes detected.")
            return rois, im0, class_ids
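    # A worked numeric example of the inverse letterbox mapping above (assumed
    # 1920x1080 original, 640x640 model, i.e. pad_w=0, pad_h=140, ratio=(1/3, 1/3)):
    #   predicted cxcywh = (320, 320, 100, 50)
    #   -> xyxy in model space   = (270, 295, 370, 345)
    #   -> minus (pad_w, pad_h)  = (270, 155, 370, 205)
    #   -> divided by min(ratio) = (810, 465, 1110, 615) on the original image.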

    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)
        print(f'image preprocess time cost:{time.time() - time_}')

        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()[0]
        print(f'data copy to host time cost:{time.time() - time2}')

        boxes, img_, class_ids = self.postprocess_v8(preds,
                                                     im0=img_,
                                                     ratio=ratio,
                                                     pad_w=pad_w,
                                                     pad_h=pad_h,
                                                     conf_threshold=self.conf_threshold,
                                                     iou_threshold=self.iou_threshold,
                                                     )
        return boxes, img_, class_ids

#************************* Segmentation **************************
class ACL_Yolov8_seg(object):
    def __init__(self, config):
    
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single     # Numpy dtype: support both FP32(np.single) and FP16(np.half) om model

        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.conf_threshold= config["conf_thres"]
        self.iou_threshold = config["iou_thres"]
        self.classes = config["classes"]

    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # print("input_size",input_size)
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            print("buffer_size",buffer_size)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # print("self.input_data[0]",self.input_data[0]["buffer"])
        # print("self.input_data[0]",self.input_data[0]["size"])
        # 将图片数据从Host传输到Device。
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()



    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)
    # Generic YOLOv8/9/11 post-processing: confidence filtering, NMS, and mask handling
    def postprocess_v8(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold, nm=32):
        """
        Post-process the prediction.

        Args:
            preds (Numpy.ndarray): predictions come from ort.session.run().
            im0 (Numpy.ndarray): [h, w, c] original input image.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
            conf_threshold (float): conf threshold.
            iou_threshold (float): iou threshold.
            nm (int): the number of masks.

        Returns:
            boxes (List): list of bounding boxes.
            segments (List): list of segments.
            masks (np.ndarray): [N, H, W], output masks.
        """
        x, protos = preds[0], preds[1]  # Unlike detection, two outputs: detection head (1, 116, 8400) and mask prototypes (1, 32, 160, 160)

        # Transpose the first output: (Batch_size, xywh_conf_cls_nm, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls_nm)
        x = np.einsum('bcn->bnc', x)  # (1, 8400, 116)
   
        # Confidence filtering on the class scores, excluding the last 32 channels
        # (the 32-dim vector holds the mask coefficients/weights associated with each box)
        x = x[np.amax(x[..., 4:-nm], axis=-1) > conf_threshold]

        # Create a new matrix which merge these(box, score, cls, nm) into one
        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
        x = np.c_[x[..., :4], np.amax(x[..., 4:-nm], axis=-1), np.argmax(x[..., 4:-nm], axis=-1), x[..., -nm:]]

        # NMS filtering
        # After NMS: np.array([[x, y, w, h, conf, cls, nm...], ...]), shape = (-1, 4 + 1 + 1 + 32)
        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]
        
        status = 1
        # Rescale bounding boxes back to the original image for drawing
        if len(x) > 0:
            # Bounding boxes format change: cxcywh -> xyxy
            x[..., [0, 1]] -= x[..., [2, 3]] / 2
            x[..., [2, 3]] += x[..., [0, 1]]

            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
            x[..., :4] /= min(ratio)

            # Bounding boxes boundary clamp
            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])

            # Unlike detection: additional mask handling
            # Process masks
            masks = self.process_mask(protos[0], x[:, 6:], x[:, :4], im0.shape)
            # Masks -> Segments(contours)
            segments = self.masks2segments(masks)

            return x[..., :6], masks, segments, status  # boxes (xyxy, conf, cls), masks, mask contours, status
        else:
            return [], [], [], status

    @staticmethod
    def masks2segments(masks):
        """
        It takes a list of masks(n,h,w) and returns a list of segments(n,xy) (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L750)

        Args:
            masks (numpy.ndarray): the output of the model, which is a tensor of shape (batch_size, 160, 160).

        Returns:
            segments (List): list of segment masks.
        """
        segments = []
        for x in masks.astype('uint8'):
            c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]  # find external contours in the binary mask (CHAIN_APPROX_SIMPLE also works)
            if c:
                # Pick the longest external contour and reshape it to an (N, 2) array of xy points.
                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
            else:
                c = np.zeros((0, 2))  # no segments found
            segments.append(c.astype('float32'))
        return segments

    def process_mask(self, protos, masks_in, bboxes, im0_shape):
        """
        Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality
        but is slower. (Borrowed from https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L618)

        Args:
            protos (numpy.ndarray): [mask_dim, mask_h, mask_w].
            masks_in (numpy.ndarray): [n, mask_dim], n is number of masks after nms.
            bboxes (numpy.ndarray): bboxes re-scaled to original image shape.
            im0_shape (tuple): the size of the input image (h,w,c).

        Returns:
            (numpy.ndarray): The upsampled masks.
        """
        c, mh, mw = protos.shape
        masks = np.matmul(masks_in, protos.reshape((c, -1))).reshape((-1, mh, mw)).transpose(1, 2, 0)  # HWN
        masks = np.ascontiguousarray(masks)
        # Re-scale masks from the prototype shape (e.g. 160x160) back to the original
        # image shape; crop_mask() below expects bboxes and masks in the same coordinates.
        masks = self.scale_mask(masks, im0_shape)
        masks = np.einsum('HWN -> NHW', masks)  # HWN -> NHW
        masks = self.crop_mask(masks, bboxes)
        return np.greater(masks, 0.5)

    @staticmethod
    def scale_mask(masks, im0_shape, ratio_pad=None):
        """
        Takes a mask, and resizes it to the original image size. (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L305)

        Args:
            masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
            im0_shape (tuple): the original image shape.
            ratio_pad (tuple): the ratio of the padding to the original image.

        Returns:
            masks (np.ndarray): The masks that are being returned.
        """
        im1_shape = masks.shape[:2]
        if ratio_pad is None:  # calculate from im0_shape
            gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain  = old / new
            pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
        else:
            pad = ratio_pad[1]

        # Calculate tlbr of mask
        top, left = int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1))  # y, x
        bottom, right = int(round(im1_shape[0] - pad[1] + 0.1)), int(round(im1_shape[1] - pad[0] + 0.1))
        if len(masks.shape) < 2:
            raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
        masks = masks[top:bottom, left:right]
        masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]),
                           interpolation=cv2.INTER_LINEAR)  # INTER_CUBIC would be better
        if len(masks.shape) == 2:
            masks = masks[:, :, None]
        return masks
    
    @staticmethod
    def crop_mask(masks, boxes):
        """
        It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box. (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L599)

        Args:
            masks (Numpy.ndarray): [n, h, w] tensor of masks.
            boxes (Numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form.

        Returns:
            (Numpy.ndarray): The masks are being cropped to the bounding box.
        """
        n, h, w = masks.shape
        x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
        r = np.arange(w, dtype=x1.dtype)[None, None, :]
        c = np.arange(h, dtype=x1.dtype)[None, :, None]
        return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
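    # Shape sketch for crop_mask() above (no new behavior, just the broadcasting):
    #   masks (n, h, w); boxes (n, 4) -> x1, y1, x2, y2 each shaped (n, 1, 1)
    #   r = arange(w) shaped (1, 1, w); c = arange(h) shaped (1, h, 1)
    #   (r >= x1) * (r < x2) * (c >= y1) * (c < y2) broadcasts to (n, h, w),
    #   zeroing every mask pixel that falls outside its own bounding box.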
    
    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)

        #******************* for cross-checking against the ONNX model ************************
        # img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
        # img = cv2.resize(img, (640, 640))
        # img = img.astype(np.float32)
        # img /= 255.0
        # mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        # std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        # img = (img - mean) / std
        # img = np.transpose(img, (2, 0, 1))
        # img = np.expand_dims(img, axis=0)        # form a batch
        #******************* for cross-checking against the ONNX model ************************

        print(f'image preprocess time cost:{time.time() - time_}')
        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()
        print(f'data copy to host time cost:{time.time() - time2}')

        boxes, masks, segments, status_ = self.postprocess_v8(preds,
                                                              im0=img_,
                                                              ratio=ratio,
                                                              pad_w=pad_w,
                                                              pad_h=pad_h,
                                                              conf_threshold=self.conf_threshold,
                                                              iou_threshold=self.iou_threshold,
                                                              )
        return boxes, masks, segments, status_  # boxes (xyxy, conf, cls), masks, mask contours, status
       


if __name__ == "__main__":
    #**************** YOLOv8 classification config ***********************
    cfg_cls = {
        "weights": './weights/best_cls.om',
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['coal', "mohu"]
    }
    #********************** YOLOv8 detection config *********************
    cfg_det = {
        "weights": '/mnt/data/yz/yolov8/weights/digital-number.om',
        "conf_thres": 0.5,
        "iou_thres": 0.4,
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['dial_3', 'dial_4']
    }

    #********************** YOLOv8 segmentation config *********************
    cfg_seg = {
        "weights": '/mnt/data/yz/yolov8/weights/coalseg_0108_jhw.om',
        "conf_thres": 0.5,
        "iou_thres": 0.4,
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['dial_3', 'dial_4']
    }
  


    # image_path = os.path.join(ROOT, "test_img/street.jpg")
    image_path = "./test_img/001.png"
    img_ = cv2.imread(image_path)

#***************** YOLOv8 classification model ********************************
    # model_cls = ACL_Yolov8_cls(cfg_cls)
    # model_cls.init(cfg_cls["weights"])
    # result_cls = model_cls.infer(img_)                  # output: dict {classname: confidence, ...}

#***************** YOLOv8 detection model ********************************
    # model_det = ACL_Yolov8_det(cfg_det)
    # model_det.init(cfg_det["weights"])
    # boxes, img_res, class_ids = model_det.infer(img_)   # output: boxes (xyxy), result image, class IDs

#***************** YOLOv8 segmentation model ********************************
    model_seg = ACL_Yolov8_seg(cfg_seg)
    model_seg.init(cfg_seg["weights"])
    boxes, masks, segments, _ = model_seg.infer(img_)     # output: boxes, masks, mask contours
    print("done")
    # # To draw the results, see below:
    # color_palette = np.random.uniform(0, 255, size=(len(cfg_seg["classes"]), 3))  # one color per class
    # im_canvas = img_.copy()
    # for (*box, conf, cls_), segment in zip(boxes, segments):
    #     # draw contour and fill mask
    #     cv2.polylines(img_, np.int32([segment]), True, (255, 255, 255), 2)  # white borderline
    #     cv2.fillPoly(im_canvas, np.int32([segment]), (255, 0, 0))

    #     # draw bbox rectangle
    #     cv2.rectangle(img_, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
    #                 color_palette[int(cls_)], 1, cv2.LINE_AA)
    #     cv2.putText(img_, f'{cfg_seg["classes"][int(cls_)]}: {conf:.3f}', (int(box[0]), int(box[1] - 9)),
    #                 cv2.FONT_HERSHEY_SIMPLEX, 0.7, color_palette[int(cls_)], 2, cv2.LINE_AA)
    # # Mix image
    # img_ = cv2.addWeighted(im_canvas, 0.3, img_, 0.7, 0)
    # cv2.imwrite("aaa.jpg", img_)
<think>嗯,用户之前问的是华为Atlas 300V的模型部署引擎,现在接着问YOLO系列目标检测的区别。看来他们对目标检测领域比较感兴趣,可能是在实际项目中需要选择适合的模型,或者在做技术调研。 首先,用户可能已经了解了华为的硬件部署,现在想了解不同YOLO模型的区别,可能是为了在Atlas 300V上部署合适的YOLO版本。需要明确各个YOLO版本的特点,尤其是性能和效率,这对边缘设备部署很重要。 用户可能希望知道哪个YOLO版本在精度和速度之间平衡最好,或者哪个更适合他们的应用场景,比如安防监控、交通检测等。同时,他们可能关心不同版本的计算需求,是否适配华为的Ascend芯片,比如模型是否支持ONNX转换,或者是否需要特定优化。 需要回顾YOLO的发展历程,从v1到v8,以及可能的衍生版本。每个版本的改进点,比如YOLOv3的Darknet-53,v4的CSP结构,v5的易用性,v6和v7的实时优化,v8的多任务支持。要突出每个版本的核心创新和应用场景,比如v3适合通用检测,v5适合工业界快速部署,v8适合需要实例分割的情况。 另外,用户可能对如何选择版本有疑问,需要给出选择建议,比如在边缘设备上推荐v5或v7,因为速度和效率高;如果需要更高精度,可能考虑v8。同时,可能需要注意不同版本在华为平台上的支持情况,比如模型转换工具是否兼容,是否需要额外的工作量。 还要考虑用户是否有特定需求,比如是否需要处理视频流,多路并发,这时候YOLO的推理速度尤为关键。可能还需要提到模型量化,比如INT8量化在Ascend芯片上的支持情况,不同YOLO版本是否容易进行量化优化。 最后,总结时应该简明扼要,对比各版本的优缺点,帮助用户根据实际需求做出选择。可能用户没有明确说出他们的具体应用场景,所以需要给出通用建议,同时提示如果有更多细节可以提供更针对性的回答。</think>YOLO(You Only Look Once)系列是目标检测领域的经典算法,其核心特点是**单阶段(one-stage)检测**和**实时性**。从 YOLOv1 到 YOLOv8,每一代的改进都围绕**精度、速度、模型轻量化**展开。以下是各版本的**核心区别**与演进方向: --- ### **1. YOLOv1(2016)** - **核心思想**:将图像划分为 \( S \times S \) 网格,每个网格预测边界框(BBox)和类别概率。 - **特点**: - **首次实现端到端实时检测**(45 FPS); - 直接回归 BBox 坐标,速度快但定位精度低; - 对小目标和密集目标检测效果差。 - **局限性**:仅预测每个网格的 2 个框,漏检率高。 --- ### **2. YOLOv2(2017)** - **改进点**: - **Anchor Boxes**:引入预定义的 Anchor 框(基于数据集聚类),提升定位精度; - **多尺度预测**:使用不同分辨率的特征图检测不同尺度的目标; - **Batch Normalization**:提升训练稳定性和收敛速度; - **High Resolution Classifier**:输入分辨率提升至 \( 448 \times 448 \)。 - **结果**:mAP 从 v1 的 63.4% 提升至 78.6%,速度保持实时性。 --- ### **3. YOLOv3(2018)** - **关键创新**: - **Darknet-53 主干网络**:结合残差结构(ResNet),提升特征提取能力; - **多尺度预测(FPN)**:3 种不同尺度的特征图(类似 FPN),增强小目标检测; - **分类器改为多标签逻辑回归**,支持重叠类别检测。 - **性能**:在 COCO 数据集上 mAP@0.5 达 57.9%,速度 20-30 FPS(Titan X)。 - **缺点**:模型参数量大,边缘设备部署困难。 --- ### **4. YOLOv4(2020)** - **优化重点**:**速度与精度的平衡**(适合 GPU 部署)。 - **核心改进**: - **CSPDarknet53**:减少计算量,提升推理速度; - **PANet(Path Aggregation Network)**:加强特征融合; - **Mish 激活函数**:增强非线性表达能力; - **数据增强策略**:CutMix、Mosaic 等提升泛化性。 - **结果**:COCO 上 mAP 达 43.5% (AP50),速度 65 FPS(Tesla V100)。 --- ### **5. YOLOv5(2020)** - **非官方但广泛使用**(由 Ultralytics 发布)。 - **改进方向**:**工程友好性**与**部署便捷性**。 - **亮点**: - **自适应锚框计算**:自动根据数据集调整 Anchor 尺寸; - **Focus 结构**:切片操作减少计算量; - **PyTorch 生态集成**:支持 ONNX、TensorRT 导出,易于部署(如华为 Atlas 平台); - **模块化设计**:提供 YOLOv5s/m/l/x 等不同参数量模型。 - **性能**:YOLOv5s 仅 7M 参数量,mAP@0.5 达 56.8%,速度 140 FPS(Tesla T4)。 --- ### **6. YOLOv6(2022,美团团队)** - **优化目标**:**工业场景的高效部署**。 - **关键技术**: - **RepVGG 风格主干网络**:训练时多分支,推理时单分支,兼顾精度与速度; - **Anchor-free 设计**:直接预测目标中心点,减少超参数依赖; - **SIoU Loss**:改进边界框回归损失函数,提升定位精度。 - **优势**:在边缘设备(如 Jetson AGX Xavier)上推理速度比 YOLOv5 快 20%。 --- ### **7. YOLOv7(2022)** - **核心创新**:**模型缩放与重参数化**。 - **改进点**: - **Extended Efficient Layer Aggregation (E-ELAN)**:动态调整特征融合路径; - **模型缩放技术**:通过复合系数统一缩放深度、宽度、分辨率; - **辅助训练头(Aux Head)**:提升训练效率。 - **性能**:相同速度下,mAP 比 YOLOv5 高 5-10%。 --- ### **8. YOLOv8(2023,Ultralytics)** - **最新版本**,支持**目标检测、实例分割、姿态估计**多任务。 - **主要改进**: - **Anchor-free 设计**:简化输出头,减少计算量; - **C2f 模块**:跨阶段部分连接,增强特征复用; - **动态标签分配(Task-Aligned Assigner)**:根据分类与回归质量分配正样本; - **Mosaic 增强升级**:结合 Copy-Paste 策略,提升小目标检测。 - **部署优势**:支持 ONNX、TensorRT、OpenVINO 等多种格式,适配华为昇腾芯片(需通过 ATC 工具转换 OM 模型)。 --- ### **各版本对比总结** | 版本 | 核心优势 | 适用场景 | 缺点 | |-------|--| | v1-v3 | 基础架构,实时性好 | 学术研究、简单检测任务 | 精度低,小目标漏检率高 | | v4 | 精度与速度平衡(GPU 优化) | 服务器端高性能推理 | 参数量大,边缘设备部署困难 | | v5 | 工程友好,易于部署 | 工业界快速落地(如 Atlas 300V 边缘端) | 非官方版本,学术认可度低 | | v6 | 边缘设备高效推理 | 嵌入式设备、智慧交通摄像头 | 社区生态较弱 | | v7 | 重参数化技术提升精度 | 高精度需求场景(如医学影像) | 训练资源消耗较大 | | v8 | 多任务支持,动态标签分配 | 复杂场景(检测+分割+姿态) | 模型灵活性可能增加调参难度 | --- ### **选择建议** 1. **边缘设备(如 Atlas 300V)**:优先选 **YOLOv5s/v7-tiny**,兼顾速度与精度; 2. **服务器端高精度场景**:选择 **YOLOv8x 或 YOLOv7**; 3. 
**多任务需求**:直接使用 **YOLOv8** 的分割/姿态估计扩展功能。 如果需要具体版本的部署示例(如华为昇腾平台适配方法),可进一步说明需求!