Deploying YOLO classification + detection + segmentation with Ascend ACL

This project deploys the YOLOv8 family on the Ascend 310B using the ACL (AscendCL) interfaces provided by Huawei CANN. It implements classification, detection, and segmentation, and shows how to call each model.
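For reference, a plausible way to produce the .om files loaded below (an assumed workflow for this setup; file names and soc_version are placeholders): export the trained checkpoint to ONNX with Ultralytics, then convert it with CANN's ATC tool.

# Assumed export step (requires the ultralytics package):
#   from ultralytics import YOLO
#   YOLO("best.pt").export(format="onnx", imgsz=640)
# Assumed ATC conversion (run inside a CANN environment):
#   atc --model=best.onnx --framework=5 --output=best \
#       --input_shape="images:1,3,640,640" --soc_version=Ascend310B1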

import acl
import numpy as np
import colorsys
import copy
from PIL import Image
import cv2
import time
import os
# from utils import cvtColor, preprocess_input, resize_image

ROOT = os.getcwd()
#************************* Classification **************************
class ACL_Yolov8_cls(object):
    def __init__(self, config):
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single
              
        # Model input parameters
        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.classes = config["classes"]

    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # For each input, allocate device memory and add it to the aclmdlDataset.
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            # dtype must match the om model's output (use np.float16 for an FP16 output).
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()   # tostring() is deprecated in NumPy; tobytes() is equivalent
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # Copy the image data from host to device.
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()


    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)
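    # A worked numeric example of the letterbox math above (assumed 1920x1080 BGR input, 640x640 model):
    #   r = min(640/1080, 640/1920) = 0.3333 -> new_unpad = (640, 360)
    #   pad_w = (640 - 640) / 2 = 0, pad_h = (640 - 360) / 2 = 140
    #   so 140 rows of gray (114) padding go on top and bottom, and
    #   preprocessing() returns a (1, 3, 640, 640) float32 tensor.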

    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)
        print(f'image preprocess time cost:{time.time() - time_}')

        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()[0]
        print(f'data copy to host time cost:{time.time() - time2}')

        # Post-processing: map each class name to its confidence column.
        result_dic = {}
        for index in range(len(self.classes)):
            classname = self.classes[index]
            result_dic[classname] = preds[:, index]

        return result_dic    # output: dict {classname: fp32 confidence}, from a (1, num_classes) tensor
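# Note: depending on how the classification model was exported, the .om output may be
# raw logits rather than probabilities. A minimal sketch (an assumption, not part of
# the original pipeline) to normalize preds before building result_dic in infer():
def softmax_np(x, axis=-1):
    """Numerically stable softmax over the given axis."""
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)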

#************************* Detection **************************
class ACL_Yolov8_det(object):
    def __init__(self, config):
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single

        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.conf_threshold= config["conf_thres"]
        self.iou_threshold = config["iou_thres"]
        self.classes = config["classes"]
    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # print("input_size",input_size)
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            print("buffer_size",buffer_size)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # print("self.input_data[0]",self.input_data[0]["buffer"])
        # print("self.input_data[0]",self.input_data[0]["size"])
        # 将图片数据从Host传输到Device。
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()

    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)

    def postprocess_v8(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold):
        """
        Post-process the prediction.

        Args:
            preds (Numpy.ndarray): predictions come from ort.session.run().
            im0 (Numpy.ndarray): [h, w, c] original input image.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
            conf_threshold (float): conf threshold.
            iou_threshold (float): iou threshold.

        Returns:
            boxes (List): list of bounding boxes.
        """
        color_palette = np.random.uniform(0, 255, size=(len(self.classes), 3))

        x = preds  # outputs: predictions (1, 84, 8400)
        # Transpose the first output: (Batch_size, xywh_conf_cls, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls)
        x = np.einsum('bcn->bnc', x)  # (1, 8400, 84)

        # Predictions filtering by conf-threshold
        x = x[np.amax(x[..., 4:], axis=-1) > conf_threshold]

        # Create a new matrix which merge these(box, score, cls) into one
        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
        x = np.c_[x[..., :4], np.amax(x[..., 4:], axis=-1), np.argmax(x[..., 4:], axis=-1)]

        # NMS filtering
        # After NMS: np.array([[x, y, w, h, conf, cls], ...]), shape = (-1, 4 + 1 + 1)
        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]

        rois = []
        class_ids = []
        scores = []

        # Rescale bounding boxes back to the original image for drawing
        if len(x) > 0:
            # Bounding boxes format change: cxcywh -> xyxy
            x[..., [0, 1]] -= x[..., [2, 3]] / 2
            x[..., [2, 3]] += x[..., [0, 1]]

            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
            x[..., :4] /= min(ratio)

            # Bounding boxes boundary clamp
            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])

            boxes= x[..., :6]

            # Extract ROIs, confidence scores and class IDs
            rois = boxes[:, :4].astype(int).tolist()
            scores = boxes[:, 4].tolist()
            class_ids = boxes[:, 5].astype(int).tolist()

            # Draw rectangles
            for (*box, conf, cls_) in boxes:
                cv2.rectangle(im0, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
                              color_palette[int(cls_)], 2, cv2.LINE_AA)
                cv2.putText(im0, f'{self.classes[int(cls_)]}: {conf:.3f}', (int(box[0]), int(box[1] - 9)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 0, 255), 2, cv2.LINE_AA)

            return rois, im0, class_ids
        else:
            print("No bounding boxes detected.")
            return rois, im0, class_ids
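    # A worked numeric example of the inverse letterbox mapping above (assumed
    # 1920x1080 original, 640x640 model, i.e. pad_w=0, pad_h=140, ratio=(1/3, 1/3)):
    #   predicted cxcywh = (320, 320, 100, 50)
    #   -> xyxy in model space   = (270, 295, 370, 345)
    #   -> minus (pad_w, pad_h)  = (270, 155, 370, 205)
    #   -> divided by min(ratio) = (810, 465, 1110, 615) on the original image.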

    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)
        print(f'image preprocess time cost:{time.time() - time_}')

        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()[0]
        print(f'data copy to host time cost:{time.time() - time2}')

        boxes, img_, class_ids = self.postprocess_v8(preds,
                                                     im0=img_,
                                                     ratio=ratio,
                                                     pad_w=pad_w,
                                                     pad_h=pad_h,
                                                     conf_threshold=self.conf_threshold,
                                                     iou_threshold=self.iou_threshold,
                                                     )
        return boxes, img_, class_ids

#************************* Segmentation **************************
class ACL_Yolov8_seg(object):
    def __init__(self, config):
    
        if os.path.isfile(config["weights"]):
            self.model_path = config["weights"]
        else:
            self.model_path = os.path.join(ROOT, 'weights', config["weights"])
        self.device_id = config["device_id"]

        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2   # note: in aclrtMemMallocPolicy, value 2 is ACL_MEM_MALLOC_NORMAL_ONLY
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []
        self.ndtype = np.single     # Numpy dtype: support both FP32(np.single) and FP16(np.half) om model

        self.imgsz = config["img_size"]
        self.model_height, self.model_width = self.imgsz[0], self.imgsz[1]  # model input (resize) size
        self.conf_threshold= config["conf_thres"]
        self.iou_threshold = config["iou_thres"]
        self.classes = config["classes"]

    def init(self, model_path):
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # Get the number of model outputs.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # For each output, allocate device memory and add it to the aclmdlDataset.
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # Allocate output memory on the device.
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        # print("input_size",input_size)
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            print("buffer_size",buffer_size)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims[0]["dims"])
            buffer_host, ret = acl.rt.malloc_host(self.output_data[i]["size"])
            # Copy the inference output from device to host.
            acl.rt.memcpy(buffer_host, self.output_data[i]["size"], self.output_data[i]["buffer"],
                          self.output_data[i]["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, self.output_data[i]["size"])
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)   # the bytes were copied, so the host buffer can be freed
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # print("self.input_data[0]",self.input_data[0]["buffer"])
        # print("self.input_data[0]",self.input_data[0]["size"])
        # 将图片数据从Host传输到Device。
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()



    def preprocessing(self, img):
        """
        Pre-processes the input image.

        Args:
            img (Numpy.ndarray): image about to be processed.

        Returns:
            img_process (Numpy.ndarray): image preprocessed for inference.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
        """
        # Resize and pad input image using letterbox() (Borrowed from Ultralytics)
        shape = img.shape[:2]  # original image shape
        new_shape = (self.model_height, self.model_width)
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        ratio = r, r
        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))
        pad_w, pad_h = (new_shape[1] - new_unpad[0]) / 2, (new_shape[0] - new_unpad[1]) / 2  # wh padding
        if shape[::-1] != new_unpad:  # resize
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
        left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
        img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114))  # pad with gray (114)

        # Transforms: HWC to CHW -> BGR to RGB -> div(255) -> contiguous -> add axis(optional)
        img = np.ascontiguousarray(np.einsum('HWC->CHW', img)[::-1], dtype=self.ndtype) / 255.0
        img_process = img[None] if len(img.shape) == 3 else img
        return img_process, ratio, (pad_w, pad_h)
    # Generic YOLOv8/9/11 post-processing: confidence filtering, NMS, and mask handling
    def postprocess_v8(self, preds, im0, ratio, pad_w, pad_h, conf_threshold, iou_threshold, nm=32):
        """
        Post-process the prediction.

        Args:
            preds (Numpy.ndarray): predictions come from ort.session.run().
            im0 (Numpy.ndarray): [h, w, c] original input image.
            ratio (tuple): width, height ratios in letterbox.
            pad_w (float): width padding in letterbox.
            pad_h (float): height padding in letterbox.
            conf_threshold (float): conf threshold.
            iou_threshold (float): iou threshold.
            nm (int): the number of masks.

        Returns:
            boxes (List): list of bounding boxes.
            segments (List): list of segments.
            masks (np.ndarray): [N, H, W], output masks.
        """
        x, protos = preds[0], preds[1]  # Unlike detection, two outputs: detection head (1, 116, 8400) and mask prototypes (1, 32, 160, 160)

        # Transpose the first output: (Batch_size, xywh_conf_cls_nm, Num_anchors) -> (Batch_size, Num_anchors, xywh_conf_cls_nm)
        x = np.einsum('bcn->bnc', x)  # (1, 8400, 116)
   
        # Confidence filtering on the class scores, excluding the last 32 channels
        # (the 32-dim vector holds the mask coefficients/weights associated with each box)
        x = x[np.amax(x[..., 4:-nm], axis=-1) > conf_threshold]

        # Create a new matrix which merge these(box, score, cls, nm) into one
        # For more details about `numpy.c_()`: https://numpy.org/doc/1.26/reference/generated/numpy.c_.html
        x = np.c_[x[..., :4], np.amax(x[..., 4:-nm], axis=-1), np.argmax(x[..., 4:-nm], axis=-1), x[..., -nm:]]

        # NMS filtering
        # After NMS: np.array([[x, y, w, h, conf, cls, nm...], ...]), shape = (-1, 4 + 1 + 1 + 32)
        x = x[cv2.dnn.NMSBoxes(x[:, :4], x[:, 4], conf_threshold, iou_threshold)]
        
        status = 1
        # Rescale bounding boxes back to the original image for drawing
        if len(x) > 0:
            # Bounding boxes format change: cxcywh -> xyxy
            x[..., [0, 1]] -= x[..., [2, 3]] / 2
            x[..., [2, 3]] += x[..., [0, 1]]

            # Rescales bounding boxes from model shape(model_height, model_width) to the shape of original image
            x[..., :4] -= [pad_w, pad_h, pad_w, pad_h]
            x[..., :4] /= min(ratio)

            # Bounding boxes boundary clamp
            x[..., [0, 2]] = x[:, [0, 2]].clip(0, im0.shape[1])
            x[..., [1, 3]] = x[:, [1, 3]].clip(0, im0.shape[0])

            # Unlike detection: additional mask handling
            # Process masks
            masks = self.process_mask(protos[0], x[:, 6:], x[:, :4], im0.shape)
            # Masks -> Segments(contours)
            segments = self.masks2segments(masks)

            return x[..., :6], masks, segments, status  # boxes (xyxy, conf, cls), masks, mask contours, status
        else:
            return [], [], [], status

    @staticmethod
    def masks2segments(masks):
        """
        It takes a list of masks(n,h,w) and returns a list of segments(n,xy) (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L750)

        Args:
            masks (numpy.ndarray): the output of the model, which is a tensor of shape (batch_size, 160, 160).

        Returns:
            segments (List): list of segment masks.
        """
        segments = []
        for x in masks.astype('uint8'):
            c = cv2.findContours(x, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]  # find external contours in the binary mask (CHAIN_APPROX_SIMPLE also works)
            if c:
                # Pick the longest external contour and reshape it to an (N, 2) array of xy points.
                c = np.array(c[np.array([len(x) for x in c]).argmax()]).reshape(-1, 2)
            else:
                c = np.zeros((0, 2))  # no segments found
            segments.append(c.astype('float32'))
        return segments

    def process_mask(self, protos, masks_in, bboxes, im0_shape):
        """
        Takes the output of the mask head, and applies the mask to the bounding boxes. This produces masks of higher quality
        but is slower. (Borrowed from https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L618)

        Args:
            protos (numpy.ndarray): [mask_dim, mask_h, mask_w].
            masks_in (numpy.ndarray): [n, mask_dim], n is number of masks after nms.
            bboxes (numpy.ndarray): bboxes re-scaled to original image shape.
            im0_shape (tuple): the size of the input image (h,w,c).

        Returns:
            (numpy.ndarray): The upsampled masks.
        """
        c, mh, mw = protos.shape
        masks = np.matmul(masks_in, protos.reshape((c, -1))).reshape((-1, mh, mw)).transpose(1, 2, 0)  # HWN
        masks = np.ascontiguousarray(masks)
        # Re-scale masks from the prototype shape (e.g. 160x160) back to the original
        # image shape; crop_mask() below expects bboxes and masks in the same coordinates.
        masks = self.scale_mask(masks, im0_shape)
        masks = np.einsum('HWN -> NHW', masks)  # HWN -> NHW
        masks = self.crop_mask(masks, bboxes)
        return np.greater(masks, 0.5)

    @staticmethod
    def scale_mask(masks, im0_shape, ratio_pad=None):
        """
        Takes a mask, and resizes it to the original image size. (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L305)

        Args:
            masks (np.ndarray): resized and padded masks/images, [h, w, num]/[h, w, 3].
            im0_shape (tuple): the original image shape.
            ratio_pad (tuple): the ratio of the padding to the original image.

        Returns:
            masks (np.ndarray): The masks that are being returned.
        """
        im1_shape = masks.shape[:2]
        if ratio_pad is None:  # calculate from im0_shape
            gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain  = old / new
            pad = (im1_shape[1] - im0_shape[1] * gain) / 2, (im1_shape[0] - im0_shape[0] * gain) / 2  # wh padding
        else:
            pad = ratio_pad[1]

        # Calculate tlbr of mask
        top, left = int(round(pad[1] - 0.1)), int(round(pad[0] - 0.1))  # y, x
        bottom, right = int(round(im1_shape[0] - pad[1] + 0.1)), int(round(im1_shape[1] - pad[0] + 0.1))
        if len(masks.shape) < 2:
            raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
        masks = masks[top:bottom, left:right]
        masks = cv2.resize(masks, (im0_shape[1], im0_shape[0]),
                           interpolation=cv2.INTER_LINEAR)  # INTER_CUBIC would be better
        if len(masks.shape) == 2:
            masks = masks[:, :, None]
        return masks
    
    @staticmethod
    def crop_mask(masks, boxes):
        """
        It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box. (Borrowed from
        https://github.com/ultralytics/ultralytics/blob/465df3024f44fa97d4fad9986530d5a13cdabdca/ultralytics/utils/ops.py#L599)

        Args:
            masks (Numpy.ndarray): [n, h, w] tensor of masks.
            boxes (Numpy.ndarray): [n, 4] tensor of bbox coordinates in relative point form.

        Returns:
            (Numpy.ndarray): The masks are being cropped to the bounding box.
        """
        n, h, w = masks.shape
        x1, y1, x2, y2 = np.split(boxes[:, :, None], 4, 1)
        r = np.arange(w, dtype=x1.dtype)[None, None, :]
        c = np.arange(h, dtype=x1.dtype)[None, :, None]
        return masks * ((r >= x1) * (r < x2) * (c >= y1) * (c < y2))
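    # Shape sketch for crop_mask() above (no new behavior, just the broadcasting):
    #   masks (n, h, w); boxes (n, 4) -> x1, y1, x2, y2 each shaped (n, 1, 1)
    #   r = arange(w) shaped (1, 1, w); c = arange(h) shaped (1, h, 1)
    #   (r >= x1) * (r < x2) * (c >= y1) * (c < y2) broadcasts to (n, h, w),
    #   zeroing every mask pixel that falls outside its own bounding box.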
    
    def infer(self, img_):
        time_ = time.time()
        img, ratio, (pad_w, pad_h) = self.preprocessing(img_)

        #******************* for cross-checking against the ONNX model ************************
        # img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
        # img = cv2.resize(img, (640, 640))
        # img = img.astype(np.float32)
        # img /= 255.0
        # mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        # std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        # img = (img - mean) / std
        # img = np.transpose(img, (2, 0, 1))
        # img = np.expand_dims(img, axis=0)        # form a batch
        #******************* for cross-checking against the ONNX model ************************

        print(f'image preprocess time cost:{time.time() - time_}')
        time0 = time.time()
        self.load_input_data(img)
        print(f'data copy to device time cost:{time.time() - time0}')

        time1 = time.time()
        self.execute()
        print(f'device inference time cost:{time.time() - time1}')

        time2 = time.time()
        preds = self.process_output()
        print(f'data copy to host time cost:{time.time() - time2}')

        boxes, masks, segments, status_ = self.postprocess_v8(preds,
                                                              im0=img_,
                                                              ratio=ratio,
                                                              pad_w=pad_w,
                                                              pad_h=pad_h,
                                                              conf_threshold=self.conf_threshold,
                                                              iou_threshold=self.iou_threshold,
                                                              )
        return boxes, masks, segments, status_  # boxes (xyxy, conf, cls), masks, mask contours, status
       


if __name__ == "__main__":
    #**************** YOLOv8 classification config ***********************
    cfg_cls = {
        "weights": './weights/best_cls.om',
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['coal', "mohu"]
    }
    #********************** YOLOv8 detection config *********************
    cfg_det = {
        "weights": '/mnt/data/yz/yolov8/weights/digital-number.om',
        "conf_thres": 0.5,
        "iou_thres": 0.4,
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['dial_3', 'dial_4']
    }

    #********************** YOLOv8 segmentation config *********************
    cfg_seg = {
        "weights": '/mnt/data/yz/yolov8/weights/coalseg_0108_jhw.om',
        "conf_thres": 0.5,
        "iou_thres": 0.4,
        "img_size": [640, 640],
        "device_id": 0,
        'classes': ['dial_3', 'dial_4']
    }
  


    # image_path = os.path.join(ROOT, "test_img/street.jpg")
    image_path = "./test_img/001.png"
    img_ = cv2.imread(image_path)

#***************** YOLOv8 classification model ********************************
    # model_cls = ACL_Yolov8_cls(cfg_cls)
    # model_cls.init(cfg_cls["weights"])
    # result_cls = model_cls.infer(img_)                  # output: dict {classname: confidence, ...}

#***************** YOLOv8 detection model ********************************
    # model_det = ACL_Yolov8_det(cfg_det)
    # model_det.init(cfg_det["weights"])
    # boxes, img_res, class_ids = model_det.infer(img_)   # output: boxes (xyxy), result image, class IDs

#***************** YOLOv8 segmentation model ********************************
    model_seg = ACL_Yolov8_seg(cfg_seg)
    model_seg.init(cfg_seg["weights"])
    boxes, masks, segments, _ = model_seg.infer(img_)     # output: boxes, masks, mask contours
    print("done")
    # # To draw the results, see below:
    # color_palette = np.random.uniform(0, 255, size=(len(cfg_seg["classes"]), 3))  # one color per class
    # im_canvas = img_.copy()
    # for (*box, conf, cls_), segment in zip(boxes, segments):
    #     # draw contour and fill mask
    #     cv2.polylines(img_, np.int32([segment]), True, (255, 255, 255), 2)  # white borderline
    #     cv2.fillPoly(im_canvas, np.int32([segment]), (255, 0, 0))

    #     # draw bbox rectangle
    #     cv2.rectangle(img_, (int(box[0]), int(box[1])), (int(box[2]), int(box[3])),
    #                 color_palette[int(cls_)], 1, cv2.LINE_AA)
    #     cv2.putText(img_, f'{cfg_seg["classes"][int(cls_)]}: {conf:.3f}', (int(box[0]), int(box[1] - 9)),
    #                 cv2.FONT_HERSHEY_SIMPLEX, 0.7, color_palette[int(cls_)], 2, cv2.LINE_AA)
    # # Mix image
    # img_ = cv2.addWeighted(im_canvas, 0.3, img_, 0.7, 0)
    # cv2.imwrite("aaa.jpg", img_)
<think>嗯,用户之前问的是华为Atlas 300V的模型部署引擎,现在接着问YOLO系列目标检测的区别。看来他们对目标检测领域比较感兴趣,可能是在实际项目中需要选择适合的模型,或者在做技术调研。 首先,用户可能已经了解了华为的硬件部署,现在想了解不同YOLO模型的区别,可能是为了在Atlas 300V上部署合适的YOLO版本。需要明确各个YOLO版本的特点,尤其是性能和效率,这对边缘设备部署很重要。 用户可能希望知道哪个YOLO版本在精度和速度之间平衡最好,或者哪个更适合他们的应用场景,比如安防监控、交通检测等。同时,他们可能关心不同版本的计算需求,是否适配华为的Ascend芯片,比如模型是否支持ONNX转换,或者是否需要特定优化。 需要回顾YOLO的发展历程,从v1到v8,以及可能的衍生版本。每个版本的改进点,比如YOLOv3的Darknet-53,v4的CSP结构,v5的易用性,v6和v7的实时优化,v8的多任务支持。要突出每个版本的核心创新和应用场景,比如v3适合通用检测,v5适合工业界快速部署,v8适合需要实例分割的情况。 另外,用户可能对如何选择版本有疑问,需要给出选择建议,比如在边缘设备上推荐v5或v7,因为速度和效率高;如果需要更高精度,可能考虑v8。同时,可能需要注意不同版本在华为平台上的支持情况,比如模型转换工具是否兼容,是否需要额外的工作量。 还要考虑用户是否有特定需求,比如是否需要处理视频流,多路并发,这时候YOLO的推理速度尤为关键。可能还需要提到模型量化,比如INT8量化在Ascend芯片上的支持情况,不同YOLO版本是否容易进行量化优化。 最后,总结时应该简明扼要,对比各版本的优缺点,帮助用户根据实际需求做出选择。可能用户没有明确说出他们的具体应用场景,所以需要给出通用建议,同时提示如果有更多细节可以提供更针对性的回答。</think>YOLO(You Only Look Once)系列是目标检测领域的经典算法,其核心特点是**单阶段(one-stage)检测**和**实时性**。从 YOLOv1 到 YOLOv8,每一代的改进都围绕**精度、速度、模型轻量化**展开。以下是各版本的**核心区别**与演进方向: --- ### **1. YOLOv1(2016)** - **核心思想**:将图像划分为 \( S \times S \) 网格,每个网格预测边界框(BBox)和类别概率。 - **特点**: - **首次实现端到端实时检测**(45 FPS); - 直接回归 BBox 坐标,速度快但定位精度低; - 对小目标和密集目标检测效果差。 - **局限性**:仅预测每个网格的 2 个框,漏检率高。 --- ### **2. YOLOv2(2017)** - **改进点**: - **Anchor Boxes**:引入预定义的 Anchor 框(基于数据集聚类),提升定位精度; - **多尺度预测**:使用不同分辨率的特征图检测不同尺度的目标; - **Batch Normalization**:提升训练稳定性和收敛速度; - **High Resolution Classifier**:输入分辨率提升至 \( 448 \times 448 \)。 - **结果**:mAP 从 v1 的 63.4% 提升至 78.6%,速度保持实时性。 --- ### **3. YOLOv3(2018)** - **关键创新**: - **Darknet-53 主干网络**:结合残差结构(ResNet),提升特征提取能力; - **多尺度预测(FPN)**:3 种不同尺度的特征图(类似 FPN),增强小目标检测; - **分类器改为多标签逻辑回归**,支持重叠类别检测。 - **性能**:在 COCO 数据集上 mAP@0.5 达 57.9%,速度 20-30 FPS(Titan X)。 - **缺点**:模型参数量大,边缘设备部署困难。 --- ### **4. YOLOv4(2020)** - **优化重点**:**速度与精度的平衡**(适合 GPU 部署)。 - **核心改进**: - **CSPDarknet53**:减少计算量,提升推理速度; - **PANet(Path Aggregation Network)**:加强特征融合; - **Mish 激活函数**:增强非线性表达能力; - **数据增强策略**:CutMix、Mosaic 等提升泛化性。 - **结果**:COCO 上 mAP 达 43.5% (AP50),速度 65 FPS(Tesla V100)。 --- ### **5. YOLOv5(2020)** - **非官方但广泛使用**(由 Ultralytics 发布)。 - **改进方向**:**工程友好性**与**部署便捷性**。 - **亮点**: - **自适应锚框计算**:自动根据数据集调整 Anchor 尺寸; - **Focus 结构**:切片操作减少计算量; - **PyTorch 生态集成**:支持 ONNX、TensorRT 导出,易于部署(如华为 Atlas 平台); - **模块化设计**:提供 YOLOv5s/m/l/x 等不同参数量模型。 - **性能**:YOLOv5s 仅 7M 参数量,mAP@0.5 达 56.8%,速度 140 FPS(Tesla T4)。 --- ### **6. YOLOv6(2022,美团团队)** - **优化目标**:**工业场景的高效部署**。 - **关键技术**: - **RepVGG 风格主干网络**:训练时多分支,推理时单分支,兼顾精度与速度; - **Anchor-free 设计**:直接预测目标中心点,减少超参数依赖; - **SIoU Loss**:改进边界框回归损失函数,提升定位精度。 - **优势**:在边缘设备(如 Jetson AGX Xavier)上推理速度比 YOLOv5 快 20%。 --- ### **7. YOLOv7(2022)** - **核心创新**:**模型缩放与重参数化**。 - **改进点**: - **Extended Efficient Layer Aggregation (E-ELAN)**:动态调整特征融合路径; - **模型缩放技术**:通过复合系数统一缩放深度、宽度、分辨率; - **辅助训练头(Aux Head)**:提升训练效率。 - **性能**:相同速度下,mAP 比 YOLOv5 高 5-10%。 --- ### **8. YOLOv8(2023,Ultralytics)** - **最新版本**,支持**目标检测、实例分割、姿态估计**多任务。 - **主要改进**: - **Anchor-free 设计**:简化输出头,减少计算量; - **C2f 模块**:跨阶段部分连接,增强特征复用; - **动态标签分配(Task-Aligned Assigner)**:根据分类与回归质量分配正样本; - **Mosaic 增强升级**:结合 Copy-Paste 策略,提升小目标检测。 - **部署优势**:支持 ONNX、TensorRT、OpenVINO 等多种格式,适配华为昇腾芯片(需通过 ATC 工具转换 OM 模型)。 --- ### **各版本对比总结** | 版本 | 核心优势 | 适用场景 | 缺点 | |-------|--| | v1-v3 | 基础架构,实时性好 | 学术研究、简单检测任务 | 精度低,小目标漏检率高 | | v4 | 精度与速度平衡(GPU 优化) | 服务器端高性能推理 | 参数量大,边缘设备部署困难 | | v5 | 工程友好,易于部署 | 工业界快速落地(如 Atlas 300V 边缘端) | 非官方版本,学术认可度低 | | v6 | 边缘设备高效推理 | 嵌入式设备、智慧交通摄像头 | 社区生态较弱 | | v7 | 重参数化技术提升精度 | 高精度需求场景(如医学影像) | 训练资源消耗较大 | | v8 | 多任务支持,动态标签分配 | 复杂场景(检测+分割+姿态) | 模型灵活性可能增加调参难度 | --- ### **选择建议** 1. **边缘设备(如 Atlas 300V)**:优先选 **YOLOv5s/v7-tiny**,兼顾速度与精度; 2. **服务器端高精度场景**:选择 **YOLOv8x 或 YOLOv7**; 3. 
**多任务需求**:直接使用 **YOLOv8** 的分割/姿态估计扩展功能。 如果需要具体版本的部署示例(如华为昇腾平台适配方法),可进一步说明需求!