1. Training the YOLOX model
- Download YOLOX
git clone https://github.com/Megvii-BaseDetection/YOLOX.git
Download the yolox-nano weight file: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth
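For example, it can be fetched straight into the repo root (any download tool works) so the training command later can reference it as yolox_nano.pth:
wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth -P YOLOX/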
- Environment
Create a conda virtual environment named yolox and install the dependencies from YOLOX/requirements.txt, then install YOLOX itself:
conda activate yolox
cd YOLOX
python setup.py develop
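For completeness, the environment creation and dependency installation that the commands above assume look roughly like this (the Python version is an assumption, pick whatever matches your setup; run the pip command inside YOLOX/ after activating the environment):
conda create -n yolox python=3.8
pip install -r requirements.txt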
- Experiment config file
Under the path
YOLOX/exps/example/yolox_voc
create a file named yolox_voc_nano.py with the following content:
# encoding: utf-8
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from yolox.data import get_yolox_datadir
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    def __init__(self):
        super(Exp, self).__init__()
        self.num_classes = 3  # number of classes
        self.depth = 0.33
        self.width = 0.25
        self.input_size = (416, 416)
        self.random_size = (10, 20)
        self.mosaic_scale = (0.5, 1.5)
        self.test_size = (416, 416)
        self.max_epoch = 500
        self.enable_mixup = False
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_model(self, sublinear=False):
        def init_yolo(M):
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if "model" not in self.__dict__:
            from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
            in_channels = [256, 512, 1024]
            # The nano model uses depthwise=True, which is the main difference.
            backbone = YOLOPAFPN(
                self.depth, self.width, in_channels=in_channels,
                act=self.act, depthwise=True,
            )
            head = YOLOXHead(
                self.num_classes, self.width, in_channels=in_channels,
                act=self.act, depthwise=True
            )
            self.model = YOLOX(backbone, head)

        self.model.apply(init_yolo)
        self.model.head.initialize_biases(1e-2)
        return self.model

    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
        from yolox.data import (
            VOCDetection,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
            worker_init_reset_seed,
        )
        from yolox.utils import (
            wait_for_the_master,
            get_local_rank,
        )
        local_rank = get_local_rank()

        with wait_for_the_master(local_rank):
            dataset = VOCDetection(
                data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
                image_sets=[('2007', 'trainval')],
                img_size=self.input_size,
                preproc=TrainTransform(
                    max_labels=50,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(
            len(self.dataset), seed=self.seed if self.seed else 0
        )

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler

        # Make sure each process has different random seed, especially for 'fork' method
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed

        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader

    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        from yolox.data import VOCDetection, ValTransform

        valdataset = VOCDetection(
            data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
            image_sets=[('2007', 'test')],
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)

        return val_loader

    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        from yolox.evaluators import VOCEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = VOCEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
        )
        return evaluator
- Modify VOC_CLASSES in
YOLOX/yolox/data/datasets/voc_classes.py
to your own class names (see the example below).
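With the three classes assumed in the exp file above, this would look like the following (the class names here are placeholders, not from the original post):
# YOLOX/yolox/data/datasets/voc_classes.py
# Keep VOC_CLASSES a tuple of strings; the names must match the <name> tags
# in your VOC xml annotations, and their order defines the class ids.
VOC_CLASSES = (
    "class_a",
    "class_b",
    "class_c",
)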
- If the training machine runs Windows, you also need to modify
YOLOX/yolox/data/datasets/voc.py
in two places:
# first place - original
path_filename = [
    (self._imgpath % self.ids[i]).split(self.root + "/")[1]
    for i in range(self.num_imgs)
]
# first place - modified (Windows paths are joined with backslashes)
path_filename = [
    (self._imgpath % self.ids[i]).split(self.root + "\\")[1]
    for i in range(self.num_imgs)
]
# second place - original
annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
# second place - modified
annopath = os.path.join(rootpath, "Annotations") + "\\{:s}.xml"
Otherwise, because Windows builds its paths with backslashes, training and evaluation fail with errors like the following:
# error at the first place
File "e:\projects\codes\yolox\yolox\data\datasets\voc.py", line 137, in __init__
path_filename = [
File "e:\projects\codes\yolox\yolox\data\datasets\voc.py", line 138, in <listcomp>
(self._imgpath % self.ids[i]).split(self.root + "/")[1]
│ │ │ │ │ │ └ 'e:\\projects\\codes\\yolox\\datasets\\VOCdevkit'
│ │ │ │ │ └ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
│ │ │ │ └ 0
│ │ │ └ [('e:\\projects\\codes\\yolox\\datasets\\VOCdevkit\\VOC2007', 'bc-057'), ('e:\\projects\\codes\\yolox\\...
│ │ └ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
│ └ '%s\\JPEGImages\\%s.jpg'
└ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
IndexError: list index out of range
# error at the second place
File "e:\projects\codes\yolox\yolox\evaluators\voc_eval.py", line 16, in parse_rec
tree = ET.parse(filename)
│ │ └ 'bc-064.xml'
│ └ <function parse at 0x0000022CFD6D15E0>
└ <module 'xml.etree.ElementTree' from 'F:\\Anaconda3\\envs\\yolox\\lib\\xml\\etree\\ElementTree.py'>
File "F:\Anaconda3\envs\yolox\lib\xml\etree\ElementTree.py", line 1202, in parse
tree.parse(source, parser)
│ │ │ └ None
│ │ └ 'bc-064.xml'
│ └ <function ElementTree.parse at 0x0000022CFD6D0670>
└ <xml.etree.ElementTree.ElementTree object at 0x0000022CFFDB20A0>
File "F:\Anaconda3\envs\yolox\lib\xml\etree\ElementTree.py", line 584, in parse
source = open(source, "rb")
└ 'bc-064.xml'
FileNotFoundError: [Errno 2] No such file or directory: 'bc-064.xml'
- Dataset format: the exp file above reads a VOC-style dataset from datasets/ under the YOLOX root (see the layout sketch below).
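Given data_dir=.../VOCdevkit and the ('2007', 'trainval') / ('2007', 'test') image sets used in the exp file, the expected layout is roughly the standard VOC one:
YOLOX/datasets/VOCdevkit/
└── VOC2007/
    ├── Annotations/          # one VOC-style .xml annotation per image
    ├── JPEGImages/           # the .jpg images
    └── ImageSets/
        └── Main/
            ├── trainval.txt  # image ids (one per line) used for training
            └── test.txt      # image ids used for evaluation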
- Training command (-f: experiment file, -d: number of training devices, -b: total batch size, --fp16: mixed precision, -o: occupy GPU memory ahead of time, -c: checkpoint to fine-tune from)
python tools/train.py -f exps/example/yolox_voc/yolox_voc_nano.py -d 0 -b 24 --fp16 -o -c yolox_nano.pth
- Check training progress with TensorBoard
cd YOLOX/YOLOX_outputs/yolox_voc_nano/tensorboard
tensorboard --logdir=./
2. Converting the model
- Convert the pth checkpoint to onnx format
python tools/export_onnx.py --output-name yolox_nano_self.onnx -f exps/example/yolox_voc/yolox_voc_nano.py -c YOLOX_outputs\yolox_voc_nano\best_ckpt.pth
This generates the model file yolox_nano_self.onnx in the project root directory.
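As an optional sanity check (assuming onnxruntime and numpy are installed), the exported model can be run on a dummy input. With the default export the network returns raw, undecoded predictions, so for a 416x416 nano model with 3 classes the output shape should be (1, 3549, 8): 52*52 + 26*26 + 13*13 grid cells, each with 4 box terms, 1 objectness score and 3 class scores.
# quick check of the exported onnx model
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("yolox_nano_self.onnx")
inp_name = sess.get_inputs()[0].name  # "images" for YOLOX exports
dummy = np.random.rand(1, 3, 416, 416).astype(np.float32)
out = sess.run(None, {inp_name: dummy})[0]
print(out.shape)  # expect (1, 3549, 8)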
- Follow sections 2.3.2-2.3.4 of the blog post "基于NCNN将YOLOX部署到Android" (deploying YOLOX to Android with NCNN) to convert the onnx model to ncnn bin and param files.
3. Deploying to the RK3308
Follow the blog post "RK3308B部署mobilenetv2_ssdlite模型" (deploying a mobilenetv2_ssdlite model on the RK3308B) for the build and deployment steps, replacing its main.cpp with the code below:
#include <iostream>
#include <thread>
#include <mutex>
#include <chrono>
#include <opencv2/opencv.hpp>
#include "layer.h"
#include "net.h"
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>
#define YOLOX_NMS_THRESH 0.45 // nms threshold
#define YOLOX_CONF_THRESH 0.25 // threshold of bounding box prob
#define YOLOX_TARGET_SIZE 416 // target image size after resize; 416 matches the nano model's input size
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else // _WIN32
#include <sys/time.h>
#endif // _WIN32
using namespace std;
using namespace cv;
#ifdef _WIN32
// returns the current time in milliseconds
double get_current_time()
{
LARGE_INTEGER freq;
LARGE_INTEGER pc;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&pc);
return pc.QuadPart * 1000.0 / freq.QuadPart;
}
#else // _WIN32
// returns the current time in milliseconds
double get_current_time()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}
#endif // _WIN32
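// The custom layer below implements the YOLOX "Focus" slice: it rearranges an input
// of shape (c, h, w) into (4*c, h/2, w/2) by moving the four pixel phases of every
// 2x2 block into separate channels. The converted .param file refers to it by the
// name "YoloV5Focus", so it is registered as a custom ncnn layer in process_thread.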
class YoloV5Focus : public ncnn::Layer
{
public:
YoloV5Focus()
{
one_blob_only = true;
}
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int outw = w / 2;
int outh = h / 2;
int outc = channels * 4;
top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outc; p++)
{
const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
float* outptr = top_blob.channel(p);
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
*outptr = *ptr;
outptr += 1;
ptr += 2;
}
ptr += w;
}
}
return 0;
}
};
DEFINE_LAYER_CREATOR(YoloV5Focus)
struct Object
{
cv::Rect_<float> rect;
int label;
float prob;
};
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_<float> inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector<Object>& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
picked.clear();
const int n = faceobjects.size();
std::vector<float> areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
if (!agnostic && a.label != b.label)
continue;
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void generate_grids_and_stride(const int target_w, const int target_h, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
for (int i = 0; i < (int)strides.size(); i++)
{
int stride = strides[i];
int num_grid_w = target_w / stride;
int num_grid_h = target_h / stride;
for (int g1 = 0; g1 < num_grid_h; g1++)
{
for (int g0 = 0; g0 < num_grid_w; g0++)
{
GridAndStride gs;
gs.grid0 = g0;
gs.grid1 = g1;
gs.stride = stride;
grid_strides.push_back(gs);
}
}
}
}
static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
const int num_grid = feat_blob.h;
const int num_class = feat_blob.w - 5;
const int num_anchors = grid_strides.size();
const float* feat_ptr = feat_blob.channel(0);
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[0] + grid0) * stride;
float y_center = (feat_ptr[1] + grid1) * stride;
float w = exp(feat_ptr[2]) * stride;
float h = exp(feat_ptr[3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
feat_ptr += feat_blob.w;
} // point anchor loop
}
static int
detect_yolox(ncnn::Net &yolox, const cv::Mat& bgr, std::vector<Object>& objects)
{
int img_w = bgr.cols;
int img_h = bgr.rows;
int w = img_w;
int h = img_h;
float scale = 1.f;
if (w > h)
{
scale = (float)YOLOX_TARGET_SIZE / w;
w = YOLOX_TARGET_SIZE;
h = h * scale;
}
else
{
scale = (float)YOLOX_TARGET_SIZE / h;
h = YOLOX_TARGET_SIZE;
w = w * scale;
}
ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);
// pad to YOLOX_TARGET_SIZE rectangle
int wpad = (w + 31) / 32 * 32 - w;
int hpad = (h + 31) / 32 * 32 - h;
ncnn::Mat in_pad;
// different from yolov5, yolox only pads on the bottom and right side,
// which means users don't need extra padding info to decode box coordinates.
ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
ncnn::Extractor ex = yolox.create_extractor();
ex.input("images", in_pad);
std::vector<Object> proposals;
{
ncnn::Mat out;
ex.extract("output", out);
static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0]));
std::vector<GridAndStride> grid_strides;
generate_grids_and_stride(in_pad.w, in_pad.h, strides, grid_strides);
generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals);
}
// sort all proposals by score from highest to lowest
qsort_descent_inplace(proposals);
// apply nms with nms_threshold
std::vector<int> picked;
nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);
int count = picked.size();
if (count == 0)
{
// fprintf(stderr, "%s\n", "No Object!");
return 0;
}
objects.resize(count);
for (int i = 0; i < count; i++)
{
objects[i] = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (objects[i].rect.x) / scale;
float y0 = (objects[i].rect.y) / scale;
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
// clip
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
objects[i].rect.x = x0;
objects[i].rect.y = y0;
objects[i].rect.width = x1 - x0;
objects[i].rect.height = y1 - y0;
}
return 1;
}
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
static const char* class_names[] = {
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
"hair drier", "toothbrush"
};
cv::Mat image = bgr.clone();
for (size_t i = 0; i < objects.size(); i++)
{
const Object& obj = objects[i];
fprintf(stderr, "%s = %.1f%% at %.2f %.2f %.2f x %.2f\n", class_names[obj.label], obj.prob * 100,
obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
char text[256];
sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
int x = obj.rect.x;
int y = obj.rect.y - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
cv::Scalar(255, 255, 255), -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
}
cv::imshow("image", image);
cv::waitKey(30);
}
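// Shared state for the two threads below: capture_thread keeps overwriting `img` with
// the newest camera frame, while process_thread clones it under the mutex and runs
// YOLOX on the copy, so slow inference on the RK3308 never blocks frame capture.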
std::mutex mtx; // mutex protecting the shared frame
cv::Mat img;
void capture_thread() {
    cv::VideoCapture cap(0); // open the default camera
    if (!cap.isOpened()) {
        fprintf(stderr, "Error:%s\n", "unable to open camera!!");
        return;
    }
    while (true) {
        mtx.lock(); // lock
        cap >> img; // grab one frame
        if (img.empty()) {
            fprintf(stderr, "Error:%s\n", "Failed to capture frame!");
            mtx.unlock(); // release the lock before retrying, otherwise the mutex would stay held
            continue;
        }
        mtx.unlock(); // unlock
        this_thread::sleep_for(chrono::milliseconds(5));
    }
}
void process_thread() {
    ncnn::Net yolox;
    yolox.opt.use_vulkan_compute = false;
    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
    if (yolox.load_param("yolox-nano.param"))
        exit(-1);
    if (yolox.load_model("yolox-nano.bin"))
        exit(-1);
    while (true) {
        double start = get_current_time();
        std::vector<Object> objects;
        mtx.lock(); // lock
        cv::Mat frame = img.clone();
        mtx.unlock(); // unlock
        if (frame.empty()) {
            this_thread::sleep_for(chrono::milliseconds(10)); // wait 10 ms until the first frame arrives
            continue;
        }
        detect_yolox(yolox, frame, objects);
        double end = get_current_time();
        double time = end - start;
        fprintf(stderr, "time is %.2f ms\n", time);
    }
}
int main() {
    std::thread t1(capture_thread); // start the capture thread
    std::thread t2(process_thread); // start the inference thread
    t1.join(); // wait for the capture thread
    t2.join(); // wait for the inference thread
    return 0;
}
Runtime measured on the RK3308:
time is 1034.17 ms
cup = 96.6% at 280.54 121.89 287.21 x 300.59
time is 1056.07 ms
cup = 96.6% at 280.68 121.91 286.94 x 300.85
time is 1081.28 ms
cup = 96.6% at 280.65 122.05 287.22 x 300.58
time is 1022.46 ms
cup = 96.6% at 280.51 122.04 287.22 x 300.80
time is 1101.37 ms
time is 1027.54 ms
time is 1047.46 ms
time is 1076.67 ms