1. Training the YOLOX model
- Download YOLOX
git clone https://github.com/Megvii-BaseDetection/YOLOX.git
Download the yolox-nano weight file: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth
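For example, it can be fetched straight into the repo root (any download tool works) so the training command later can reference it as yolox_nano.pth:
wget https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_nano.pth -P YOLOX/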
- Environment
Create a conda virtual environment named yolox and install the dependencies from YOLOX/requirements.txt, then install YOLOX itself:
conda activate yolox
cd YOLOX
python setup.py develop
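For completeness, the environment creation and dependency installation that the commands above assume look roughly like this (the Python version is an assumption, pick whatever matches your setup; run the pip command inside YOLOX/ after activating the environment):
conda create -n yolox python=3.8
pip install -r requirements.txt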
- Experiment config file
Under the path
YOLOX/exps/example/yolox_voc
create a file named yolox_voc_nano.py with the following content:
# encoding: utf-8
import os
import torch
import torch.distributed as dist
import torch.nn as nn
from yolox.data import get_yolox_datadir
from yolox.exp import Exp as MyExp
class Exp(MyExp):
    def __init__(self):
        super(Exp, self).__init__()
        self.num_classes = 3  # number of classes
        self.depth = 0.33
        self.width = 0.25
        self.input_size = (416, 416)
        self.random_size = (10, 20)
        self.mosaic_scale = (0.5, 1.5)
        self.test_size = (416, 416)
        self.max_epoch = 500
        self.enable_mixup = False
        self.exp_name = os.path.split(os.path.realpath(__file__))[1].split(".")[0]

    def get_model(self, sublinear=False):
        def init_yolo(M):
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if "model" not in self.__dict__:
            from yolox.models import YOLOX, YOLOPAFPN, YOLOXHead
            in_channels = [256, 512, 1024]
            # The nano model uses depthwise=True, which is the main difference.
            backbone = YOLOPAFPN(
                self.depth, self.width, in_channels=in_channels,
                act=self.act, depthwise=True,
            )
            head = YOLOXHead(
                self.num_classes, self.width, in_channels=in_channels,
                act=self.act, depthwise=True
            )
            self.model = YOLOX(backbone, head)

        self.model.apply(init_yolo)
        self.model.head.initialize_biases(1e-2)
        return self.model

    def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img=False):
        from yolox.data import (
            VOCDetection,
            TrainTransform,
            YoloBatchSampler,
            DataLoader,
            InfiniteSampler,
            MosaicDetection,
            worker_init_reset_seed,
        )
        from yolox.utils import (
            wait_for_the_master,
            get_local_rank,
        )
        local_rank = get_local_rank()

        with wait_for_the_master(local_rank):
            dataset = VOCDetection(
                data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
                image_sets=[('2007', 'trainval')],
                img_size=self.input_size,
                preproc=TrainTransform(
                    max_labels=50,
                    flip_prob=self.flip_prob,
                    hsv_prob=self.hsv_prob),
                cache=cache_img,
            )

        dataset = MosaicDetection(
            dataset,
            mosaic=not no_aug,
            img_size=self.input_size,
            preproc=TrainTransform(
                max_labels=120,
                flip_prob=self.flip_prob,
                hsv_prob=self.hsv_prob),
            degrees=self.degrees,
            translate=self.translate,
            mosaic_scale=self.mosaic_scale,
            mixup_scale=self.mixup_scale,
            shear=self.shear,
            enable_mixup=self.enable_mixup,
            mosaic_prob=self.mosaic_prob,
            mixup_prob=self.mixup_prob,
        )

        self.dataset = dataset

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()

        sampler = InfiniteSampler(
            len(self.dataset), seed=self.seed if self.seed else 0
        )

        batch_sampler = YoloBatchSampler(
            sampler=sampler,
            batch_size=batch_size,
            drop_last=False,
            mosaic=not no_aug,
        )

        dataloader_kwargs = {"num_workers": self.data_num_workers, "pin_memory": True}
        dataloader_kwargs["batch_sampler"] = batch_sampler

        # Make sure each process has different random seed, especially for 'fork' method
        dataloader_kwargs["worker_init_fn"] = worker_init_reset_seed

        train_loader = DataLoader(self.dataset, **dataloader_kwargs)

        return train_loader

    def get_eval_loader(self, batch_size, is_distributed, testdev=False, legacy=False):
        from yolox.data import VOCDetection, ValTransform

        valdataset = VOCDetection(
            data_dir=os.path.join(get_yolox_datadir(), "VOCdevkit"),
            image_sets=[('2007', 'test')],
            img_size=self.test_size,
            preproc=ValTransform(legacy=legacy),
        )

        if is_distributed:
            batch_size = batch_size // dist.get_world_size()
            sampler = torch.utils.data.distributed.DistributedSampler(
                valdataset, shuffle=False
            )
        else:
            sampler = torch.utils.data.SequentialSampler(valdataset)

        dataloader_kwargs = {
            "num_workers": self.data_num_workers,
            "pin_memory": True,
            "sampler": sampler,
        }
        dataloader_kwargs["batch_size"] = batch_size
        val_loader = torch.utils.data.DataLoader(valdataset, **dataloader_kwargs)

        return val_loader

    def get_evaluator(self, batch_size, is_distributed, testdev=False, legacy=False):
        from yolox.evaluators import VOCEvaluator

        val_loader = self.get_eval_loader(batch_size, is_distributed, testdev, legacy)
        evaluator = VOCEvaluator(
            dataloader=val_loader,
            img_size=self.test_size,
            confthre=self.test_conf,
            nmsthre=self.nmsthre,
            num_classes=self.num_classes,
        )
        return evaluator
- Modify VOC_CLASSES in
YOLOX/yolox/data/datasets/voc_classes.py
to your own class names (see the example below).
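With the three classes assumed in the exp file above, this would look like the following (the class names here are placeholders, not from the original post):
# YOLOX/yolox/data/datasets/voc_classes.py
# Keep VOC_CLASSES a tuple of strings; the names must match the <name> tags
# in your VOC xml annotations, and their order defines the class ids.
VOC_CLASSES = (
    "class_a",
    "class_b",
    "class_c",
)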
- If the training machine runs Windows, you also need to modify
YOLOX/yolox/data/datasets/voc.py
in two places:
# first place - original
path_filename = [
    (self._imgpath % self.ids[i]).split(self.root + "/")[1]
    for i in range(self.num_imgs)
]
# first place - modified (Windows paths are joined with backslashes)
path_filename = [
    (self._imgpath % self.ids[i]).split(self.root + "\\")[1]
    for i in range(self.num_imgs)
]
# second place - original
annopath = os.path.join(rootpath, "Annotations", "{:s}.xml")
# second place - modified
annopath = os.path.join(rootpath, "Annotations") + "\\{:s}.xml"
Otherwise, because Windows builds its paths with backslashes, training and evaluation fail with errors like the following:
# error at the first place
File "e:\projects\codes\yolox\yolox\data\datasets\voc.py", line 137, in __init__
path_filename = [
File "e:\projects\codes\yolox\yolox\data\datasets\voc.py", line 138, in <listcomp>
(self._imgpath % self.ids[i]).split(self.root + "/")[1]
│ │ │ │ │ │ └ 'e:\\projects\\codes\\yolox\\datasets\\VOCdevkit'
│ │ │ │ │ └ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
│ │ │ │ └ 0
│ │ │ └ [('e:\\projects\\codes\\yolox\\datasets\\VOCdevkit\\VOC2007', 'bc-057'), ('e:\\projects\\codes\\yolox\\...
│ │ └ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
│ └ '%s\\JPEGImages\\%s.jpg'
└ <yolox.data.datasets.voc.VOCDetection object at 0x0000021DD1A8C610>
IndexError: list index out of range
# error at the second place
File "e:\projects\codes\yolox\yolox\evaluators\voc_eval.py", line 16, in parse_rec
tree = ET.parse(filename)
│ │ └ 'bc-064.xml'
│ └ <function parse at 0x0000022CFD6D15E0>
└ <module 'xml.etree.ElementTree' from 'F:\\Anaconda3\\envs\\yolox\\lib\\xml\\etree\\ElementTree.py'>
File "F:\Anaconda3\envs\yolox\lib\xml\etree\ElementTree.py", line 1202, in parse
tree.parse(source, parser)
│ │ │ └ None
│ │ └ 'bc-064.xml'
│ └ <function ElementTree.parse at 0x0000022CFD6D0670>
└ <xml.etree.ElementTree.ElementTree object at 0x0000022CFFDB20A0>
File "F:\Anaconda3\envs\yolox\lib\xml\etree\ElementTree.py", line 584, in parse
source = open(source, "rb")
└ 'bc-064.xml'
FileNotFoundError: [Errno 2] No such file or directory: 'bc-064.xml'
- Dataset format: the exp file above reads a VOC-style dataset from datasets/ under the YOLOX root (see the layout sketch below).
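Given data_dir=.../VOCdevkit and the ('2007', 'trainval') / ('2007', 'test') image sets used in the exp file, the expected layout is roughly the standard VOC one:
YOLOX/datasets/VOCdevkit/
└── VOC2007/
    ├── Annotations/          # one VOC-style .xml annotation per image
    ├── JPEGImages/           # the .jpg images
    └── ImageSets/
        └── Main/
            ├── trainval.txt  # image ids (one per line) used for training
            └── test.txt      # image ids used for evaluation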
- Training command (-f: experiment file, -d: number of training devices, -b: total batch size, --fp16: mixed precision, -o: occupy GPU memory ahead of time, -c: checkpoint to fine-tune from)
python tools/train.py -f exps/example/yolox_voc/yolox_voc_nano.py -d 0 -b 24 --fp16 -o -c yolox_nano.pth
- Check training progress with TensorBoard
cd YOLOX/YOLOX_outputs/yolox_voc_nano/tensorboard
tensorboard --logdir=./
2. Converting the model
- Convert the pth checkpoint to onnx format
python tools/export_onnx.py --output-name yolox_nano_self.onnx -f exps/example/yolox_voc/yolox_voc_nano.py -c YOLOX_outputs\yolox_voc_nano\best_ckpt.pth
This generates the model file yolox_nano_self.onnx in the project root directory.
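As an optional sanity check (assuming onnxruntime and numpy are installed), the exported model can be run on a dummy input. With the default export the network returns raw, undecoded predictions, so for a 416x416 nano model with 3 classes the output shape should be (1, 3549, 8): 52*52 + 26*26 + 13*13 grid cells, each with 4 box terms, 1 objectness score and 3 class scores.
# quick check of the exported onnx model
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("yolox_nano_self.onnx")
inp_name = sess.get_inputs()[0].name  # "images" for YOLOX exports
dummy = np.random.rand(1, 3, 416, 416).astype(np.float32)
out = sess.run(None, {inp_name: dummy})[0]
print(out.shape)  # expect (1, 3549, 8)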
- Follow sections 2.3.2-2.3.4 of the blog post "基于NCNN将YOLOX部署到Android" (deploying YOLOX to Android with NCNN) to convert the onnx model to ncnn bin and param files.
3. Deploying to the RK3308
Follow the blog post "RK3308B部署mobilenetv2_ssdlite模型" (deploying a mobilenetv2_ssdlite model on the RK3308B) for the build and deployment steps, replacing its main.cpp with the code below:
#include <iostream>
#include <thread>
#include <mutex>
#include <chrono>
#include <opencv2/opencv.hpp>
#include "layer.h"
#include "net.h"
#if defined(USE_NCNN_SIMPLEOCV)
#include "simpleocv.h"
#else
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#endif
#include <float.h>
#include <stdio.h>
#include <vector>
#define YOLOX_NMS_THRESH 0.45 // nms threshold
#define YOLOX_CONF_THRESH 0.25 // threshold of bounding box prob
#define YOLOX_TARGET_SIZE 416 // target image size after resize; 416 matches the nano model's input size
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#else // _WIN32
#include <sys/time.h>
#endif // _WIN32
using namespace std;
using namespace cv;
#ifdef _WIN32
// returns the current time in milliseconds
double get_current_time()
{
LARGE_INTEGER freq;
LARGE_INTEGER pc;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&pc);
return pc.QuadPart * 1000.0 / freq.QuadPart;
}
#else // _WIN32
// returns the current time in milliseconds
double get_current_time()
{
struct timeval tv;
gettimeofday(&tv, NULL);
return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}
#endif // _WIN32
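// The custom layer below implements the YOLOX "Focus" slice: it rearranges an input
// of shape (c, h, w) into (4*c, h/2, w/2) by moving the four pixel phases of every
// 2x2 block into separate channels. The converted .param file refers to it by the
// name "YoloV5Focus", so it is registered as a custom ncnn layer in process_thread.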
class YoloV5Focus : public ncnn::Layer
{
public:
YoloV5Focus()
{
one_blob_only = true;
}
virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
{
int w = bottom_blob.w;
int h = bottom_blob.h;
int channels = bottom_blob.c;
int outw = w / 2;
int outh = h / 2;
int outc = channels * 4;
top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
if (top_blob.empty())
return -100;
#pragma omp parallel for num_threads(opt.num_threads)
for (int p = 0; p < outc; p++)
{
const float* ptr = bottom_blob.channel(p % channels).row((p / channels) % 2) + ((p / channels) / 2);
float* outptr = top_blob.channel(p);
for (int i = 0; i < outh; i++)
{
for (int j = 0; j < outw; j++)
{
*outptr = *ptr;
outptr += 1;
ptr += 2;
}
ptr += w;
}
}
return 0;
}
};
DEFINE_LAYER_CREATOR(YoloV5Focus)
struct Object
{
cv::Rect_<float> rect;
int label;
float prob;
};
struct GridAndStride
{
int grid0;
int grid1;
int stride;
};
static inline float intersection_area(const Object& a, const Object& b)
{
cv::Rect_<float> inter = a.rect & b.rect;
return inter.area();
}
static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
{
int i = left;
int j = right;
float p = faceobjects[(left + right) / 2].prob;
while (i <= j)
{
while (faceobjects[i].prob > p)
i++;
while (faceobjects[j].prob < p)
j--;
if (i <= j)
{
// swap
std::swap(faceobjects[i], faceobjects[j]);
i++;
j--;
}
}
#pragma omp parallel sections
{
#pragma omp section
{
if (left < j) qsort_descent_inplace(faceobjects, left, j);
}
#pragma omp section
{
if (i < right) qsort_descent_inplace(faceobjects, i, right);
}
}
}
static void qsort_descent_inplace(std::vector<Object>& objects)
{
if (objects.empty())
return;
qsort_descent_inplace(objects, 0, objects.size() - 1);
}
static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
{
picked.clear();
const int n = faceobjects.size();
std::vector<float> areas(n);
for (int i = 0; i < n; i++)
{
areas[i] = faceobjects[i].rect.area();
}
for (int i = 0; i < n; i++)
{
const Object& a = faceobjects[i];
int keep = 1;
for (int j = 0; j < (int)picked.size(); j++)
{
const Object& b = faceobjects[picked[j]];
if (!agnostic && a.label != b.label)
continue;
// intersection over union
float inter_area = intersection_area(a, b);
float union_area = areas[i] + areas[picked[j]] - inter_area;
// float IoU = inter_area / union_area
if (inter_area / union_area > nms_threshold)
keep = 0;
}
if (keep)
picked.push_back(i);
}
}
static void generate_grids_and_stride(const int target_w, const int target_h, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
{
for (int i = 0; i < (int)strides.size(); i++)
{
int stride = strides[i];
int num_grid_w = target_w / stride;
int num_grid_h = target_h / stride;
for (int g1 = 0; g1 < num_grid_h; g1++)
{
for (int g0 = 0; g0 < num_grid_w; g0++)
{
GridAndStride gs;
gs.grid0 = g0;
gs.grid1 = g1;
gs.stride = stride;
grid_strides.push_back(gs);
}
}
}
}
static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
{
const int num_grid = feat_blob.h;
const int num_class = feat_blob.w - 5;
const int num_anchors = grid_strides.size();
const float* feat_ptr = feat_blob.channel(0);
for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
{
const int grid0 = grid_strides[anchor_idx].grid0;
const int grid1 = grid_strides[anchor_idx].grid1;
const int stride = grid_strides[anchor_idx].stride;
// yolox/models/yolo_head.py decode logic
// outputs[..., :2] = (outputs[..., :2] + grids) * strides
// outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
float x_center = (feat_ptr[0] + grid0) * stride;
float y_center = (feat_ptr[1] + grid1) * stride;
float w = exp(feat_ptr[2]) * stride;
float h = exp(feat_ptr[3]) * stride;
float x0 = x_center - w * 0.5f;
float y0 = y_center - h * 0.5f;
float box_objectness = feat_ptr[4];
for (int class_idx = 0; class_idx < num_class; class_idx++)
{
float box_cls_score = feat_ptr[5 + class_idx];
float box_prob = box_objectness * box_cls_score;
if (box_prob > prob_threshold)
{
Object obj;
obj.rect.x = x0;
obj.rect.y = y0;
obj.rect.width = w;
obj.rect.height = h;
obj.label = class_idx;
obj.prob = box_prob;
objects.push_back(obj);
}
} // class loop
feat_ptr += feat_blob.w;
} // point anchor loop
}
static int
detect_yolox(ncnn::Net &yolox, const cv::Mat& bgr, std::vector<Object>& objects)
{
int img_w = bgr.cols;
int img_h = bgr.rows;
int w = img_w;
int h = img_h;
float scale = 1.f;
if (w > h)
{
scale = (float)YOLOX_TARGET_SIZE / w;
w = YOLOX_TARGET_SIZE;
h = h * scale;
}
else
{
scale = (float)YOLOX_TARGET_SIZE / h;
h = YOLOX_TARGET_SIZE;
w = w * scale;
}
ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);
// pad to YOLOX_TARGET_SIZE rectangle
int wpad = (w + 31) / 32 * 32 - w;
int hpad = (h + 31) / 32 * 32 - h;
ncnn::Mat in_pad;
// different from yolov5, yolox only pads on the bottom and right side,
// which means users don't need extra padding info to decode box coordinates.
ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
ncnn::Extractor ex = yolox.create_extractor();
ex.input("images", in_pad);
std::vector<Object> proposals;
{
ncnn::Mat out;
ex.extract("output", out);
static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0]));
std::vector<GridAndStride> grid_strides;
generate_grids_and_stride(in_pad.w, in_pad.h, strides, grid_strides);
generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals);
}
// sort all proposals by score from highest to lowest
qsort_descent_inplace(proposals);
// apply nms with nms_threshold
std::vector<int> picked;
nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);
int count = picked.size();
if (count == 0)
{
// fprintf(stderr, "%s\n", "No Object!");
return 0;
}
objects.resize(count);
for (int i = 0; i < count; i++)
{
objects[i] = proposals[picked[i]];
// adjust offset to original unpadded
float x0 = (objects[i].rect.x) / scale;
float y0 = (objects[i].rect.y) / scale;
float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
// clip
x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
objects[i].rect.x = x0;
objects[i].rect.y = y0;
objects[i].rect.width = x1 - x0;
objects[i].rect.height = y1 - y0;
}
return 1;
}
static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
{
static const char* class_names[] = {
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
"hair drier", "toothbrush"
};
cv::Mat image = bgr.clone();
for (size_t i = 0; i < objects.size(); i++)
{
const Object& obj = objects[i];
fprintf(stderr, "%s = %.1f%% at %.2f %.2f %.2f x %.2f\n", class_names[obj.label], obj.prob * 100,
obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
char text[256];
sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
int baseLine = 0;
cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
int x = obj.rect.x;
int y = obj.rect.y - label_size.height - baseLine;
if (y < 0)
y = 0;
if (x + label_size.width > image.cols)
x = image.cols - label_size.width;
cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
cv::Scalar(255, 255, 255), -1);
cv::putText(image, text, cv::Point(x, y + label_size.height),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
}
cv::imshow("image", image);
cv::waitKey(30);
}
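// Shared state for the two threads below: capture_thread keeps overwriting `img` with
// the newest camera frame, while process_thread clones it under the mutex and runs
// YOLOX on the copy, so slow inference on the RK3308 never blocks frame capture.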
std::mutex mtx; // mutex protecting the shared frame
cv::Mat img;
void capture_thread() {
    cv::VideoCapture cap(0); // open the default camera
    if (!cap.isOpened()) {
        fprintf(stderr, "Error:%s\n", "unable to open camera!!");
        return;
    }
    while (true) {
        mtx.lock(); // lock
        cap >> img; // grab one frame
        if (img.empty()) {
            fprintf(stderr, "Error:%s\n", "Failed to capture frame!");
            mtx.unlock(); // release the lock before retrying, otherwise the mutex would stay held
            continue;
        }
        mtx.unlock(); // unlock
        this_thread::sleep_for(chrono::milliseconds(5));
    }
}
void process_thread() {
    ncnn::Net yolox;
    yolox.opt.use_vulkan_compute = false;
    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
    if (yolox.load_param("yolox-nano.param"))
        exit(-1);
    if (yolox.load_model("yolox-nano.bin"))
        exit(-1);
    while (true) {
        double start = get_current_time();
        std::vector<Object> objects;
        mtx.lock(); // lock
        cv::Mat frame = img.clone();
        mtx.unlock(); // unlock
        if (frame.empty()) {
            this_thread::sleep_for(chrono::milliseconds(10)); // wait 10 ms until the first frame arrives
            continue;
        }
        detect_yolox(yolox, frame, objects);
        double end = get_current_time();
        double time = end - start;
        fprintf(stderr, "time is %.2f ms\n", time);
    }
}
int main() {
    std::thread t1(capture_thread); // start the capture thread
    std::thread t2(process_thread); // start the inference thread
    t1.join(); // wait for the capture thread
    t2.join(); // wait for the inference thread
    return 0;
}
Runtime measured on the RK3308:
time is 1034.17 ms
cup = 96.6% at 280.54 121.89 287.21 x 300.59
time is 1056.07 ms
cup = 96.6% at 280.68 121.91 286.94 x 300.85
time is 1081.28 ms
cup = 96.6% at 280.65 122.05 287.22 x 300.58
time is 1022.46 ms
cup = 96.6% at 280.51 122.04 287.22 x 300.80
time is 1101.37 ms
time is 1027.54 ms
time is 1047.46 ms
time is 1076.67 ms