【darknet源码阅读】——Yolo_layer

最新推荐文章于 2025-05-18 10:35:14 发布

原创最新推荐文章于 2025-05-18 10:35:14 发布 · 1.2k 阅读

10 ·

CC 4.0 BY-SA版权

文章标签：

#yolo #darknet #物体检测 #源码阅读

本文深入解析YOLO目标检测算法中的Yolo层实现细节，包括前向传播过程中的预测框生成、真实框匹配策略、置信度及类别损失计算等关键步骤。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

话不多说，先上核心【前向传播】源码 —— talk is cheap, show me the code.

// Forward pass of the YOLO detection layer.
//
// Copies the layer input into l.output, applies the LOGISTIC activation to the
// x/y entries and to the objectness + class entries (CPU build only), and, when
// net.train is set, fills l.delta with (target - output) terms for objectness,
// class and box entries, stores the cost in *(l.cost) and prints statistics.
//
//   l   - this layer (output/delta buffers, anchors in l.biases, l.mask, thresholds)
//   net - network state (input, truth boxes, training flag, input size net.w/net.h)
void forward_yolo_layer(const layer l, network net)
{
    int i,j,b,t,n;
    // Start from a raw copy of the layer input; activations are applied in place below.
    memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

    // Compiled only when GPU is not defined.
#ifndef GPU
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            // Entries 0 and 1 (x, y) of anchor n: 2 planes of l.w*l.h values.
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            // Entry 4 (objectness) followed by l.classes class planes.
            // Entries 2 and 3 (w, h) are left untouched.
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }
#endif

    // Every delta starts at zero; entries not written below stay zero.
    memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
    // Nothing more to do outside training.
    if(!net.train) return;
    // Accumulators for the statistics printed at the end.
    float avg_iou = 0;
    float recall = 0;
    float recall75 = 0;
    float avg_cat = 0;
    float avg_obj = 0;
    float avg_anyobj = 0;
    int count = 0;
    int class_count = 0;
    *(l.cost) = 0;
    for (b = 0; b < l.batch; ++b) {
        // Pass 1: visit every predictor (cell j,i and anchor slot n) of image b.
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    // Decode the predicted box using anchor l.mask[n].
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
                    float best_iou = 0;
                    int best_t = 0;
                    // Find the truth box with the highest IoU against this prediction.
                    // Each truth record is 4 + 1 floats; image b starts at b*l.truths.
                    for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                        // A record whose x is 0 ends the list for this image.
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
                            best_iou = iou;
                            best_t = t;
                        }
                    }
                    int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    avg_anyobj += l.output[obj_index];
                    // Default: objectness target is 0 (no object).
                    l.delta[obj_index] = 0 - l.output[obj_index];
                    // Overlap above ignore_thresh: drop the no-object term for this predictor.
                    if (best_iou > l.ignore_thresh) {
                        l.delta[obj_index] = 0;
                    }
                    // Overlap above truth_thresh: treat the predictor as a positive for best_t.
                    if (best_iou > l.truth_thresh) {
                        l.delta[obj_index] = 1 - l.output[obj_index];

                        // The fifth float of a truth record is the class id; optionally remapped via l.map.
                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class = l.map[class];
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
                        // Box delta is weighted by (2 - truth.w*truth.h).
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
                    }
                }
            }
        }
        // Pass 2: visit every truth box of image b and assign it to one predictor.
        for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);

            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            // Grid cell containing the truth centre (float -> int conversion truncates).
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            // Compare shapes only: move the truth box to the origin.
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            // Search all l.total anchors (not just this layer's) for the best shape match.
            for(n = 0; n < l.total; ++n){
                box pred = {0};
                pred.w = l.biases[2*n]/net.w;
                pred.h = l.biases[2*n+1]/net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n;
                }
            }

            // NOTE(review): int_index is defined elsewhere; presumably it returns the position of
            // best_n inside l.mask, or a negative value when this layer does not own that anchor.
            int mask_n = int_index(l.mask, best_n, l.n);
            if(mask_n >= 0){
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                // Box delta for the selected predictor; the returned IoU feeds the statistics.
                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);

                // Objectness target is 1 here; this overrides whatever pass 1 wrote.
                int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
                avg_obj += l.output[obj_index];
                l.delta[obj_index] = 1 - l.output[obj_index];

                int class = net.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);

                // Only truth boxes whose best anchor belongs to this layer are counted.
                ++count;
                ++class_count;
                if(iou > .5) recall += 1;
                if(iou > .75) recall75 += 1;
                avg_iou += iou;
            }
        }
    }
    // NOTE(review): mag_array is defined elsewhere; presumably the L2 norm, making the cost the sum of squared deltas.
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    // NOTE(review): count and class_count are still 0 when no truth box was assigned to this
    // layer, so the averages below are then divisions by zero.
    printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);
}

分解如下（按行分解，不去管括号与语法问题）

第一部分，l.output输出参数做logistic函数处理

#ifndef GPU
    for (b = 0; b < l.batch; ++b){
        for(n = 0; n < l.n; ++n){
            int index = entry_index(l, b, n*l.w*l.h, 0);
            activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
            index = entry_index(l, b, n*l.w*l.h, 4);
            activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }
#endif

在这里用到了entry_index函数，定位l.output的对应位置

// Flat offset of one value inside l.output / l.delta.
//   batch    - image index within the batch
//   location - anchor_slot*(l.w*l.h) + cell offset inside the l.w*l.h grid
//   entry    - which of the (4 + 1 + l.classes) planes of that anchor slot
// The return expression fixes the memory layout: per image, values are ordered
// as [anchor slot][entry][cell].
static int entry_index(layer l, int batch, int location, int entry)
{
    const int cells = l.w*l.h;
    const int anchor = location / cells;
    const int cell = location % cells;
    const int per_anchor = cells*(4+l.classes+1);

    return batch*l.outputs + anchor*per_anchor + entry*cells + cell;
}

从entry_index函数中我们看到，l.output的排列顺序为batch*n*(4+1+l.classes)*w*h。

于是entry指代了（4+1+l.classes）中的第几个通道。

故分别进行logistic函数处理的数为batch*n*2*w*h和batch*n*(l.classes+1)*w*h。

与论文yolo-v3对比，可知，第一部分处理后为tx和ty，第二部分处理后为类的概率值与置信度，没有处理部分为tw与th。

第二部分，把l.output解析成预测的boxes

关键代码

for (b = 0; b < l.batch; ++b) {
        for (j = 0; j < l.h; ++j) {
            for (i = 0; i < l.w; ++i) {
                for (n = 0; n < l.n; ++n) {
                    int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
                    box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);

box_index指向位置为i,j的长度为（4+1+l.classes）的盒向量的首部。

看get_yolo_box函数

// Decode one predicted box from the layer output.
//   x       - layer output buffer
//   biases  - anchor sizes, stored as (width, height) pairs
//   n       - anchor index into biases
//   index   - offset of the box's first entry (x) inside the buffer
//   i, j    - column and row of the grid cell
//   lw, lh  - grid width and height
//   w, h    - divisors applied to the anchor-scaled width and height
//   stride  - distance between consecutive entries of the same box
// Returns the box with x, y divided by the grid size and w, h divided by w, h.
box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
{
    const float raw_x = x[index + 0*stride];
    const float raw_y = x[index + 1*stride];
    const float raw_w = x[index + 2*stride];
    const float raw_h = x[index + 3*stride];
    const float anchor_w = biases[2*n];
    const float anchor_h = biases[2*n+1];

    box b;
    // Cell offset plus in-cell position, normalised by the grid size.
    b.x = (i + raw_x) / lw;
    b.y = (j + raw_y) / lh;
    // Anchor size scaled by exp(raw output); biases holds the anchor box parameters.
    b.w = exp(raw_w) * anchor_w / w;
    b.h = exp(raw_h) * anchor_h / h;
    return b;
}

由于l.output的排列顺序为batch*n*(4+l.classes+1)*w*h，故需要stride=w*h来切换盒向量位置，盒向量前四个分别为x,y,w,h。

因为预测的w和h是与先验框anchor进行对比，需要anchor数据，这里为biases，配置输入的anchor box参数，在yolov3-voc.cfg中有如下参数配置：

anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326

两两一对，分别表示anchor的宽与高。

第三部分，计算真实框与预测框的iou，并寻找与每个预测框最匹配的真实框

for(t = 0; t < l.max_boxes; ++t){
                        box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
                            best_iou = iou;
                            best_t = t;
                        }

从box truth的获取过程可以看出，net.truth的数据结构为batch*90*(4+1)，即每个图片的真实框用90个boxes来表示，填不满的数据用空boxes来写。然后，对于每个预测框，找出与它iou最大的真实框。

第四部分，计算置信度的损失

// Excerpt from forward_yolo_layer: objectness delta of one predictor, decided from best_iou.
int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
                    avg_anyobj += l.output[obj_index];
                    l.delta[obj_index] = 0 - l.output[obj_index];
                    if (best_iou > l.ignore_thresh) {
                        l.delta[obj_index] = 0;
                    }
                    if (best_iou > l.truth_thresh) { //This condition can never be met: yolov3-voc.cfg sets truth_thresh=1
                        l.delta[obj_index] = 1 - l.output[obj_index];

                        int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
                        if (l.map) class = l.map[class];
                        int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
                        delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
                        box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
                        delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
                    }

这里主要关注l.delta[obj_index]与l.output[obj_index]：l.output[obj_index]是网络输出的置信度，而l.delta[obj_index]保存的是置信度损失的梯度。置信度损失的计算分为两步：先假设所有位置都没有真实框，于是l.delta[obj_index] = 0 - l.output[obj_index]，真实框所在位置的预测框留到后面阶段再处理。在best_iou<ignore_thresh时，保留上述损失；若best_iou>ignore_thresh，则把梯度赋值为0，即这个预测框与某个真实框已经足够重合，不再把它当作负样本惩罚；如果它正好是某个真实框选中的预测框，它的置信度损失会在后面按真实置信度为1重新计算。

在论文中置信度损失也是由两部分组成的，一个是no-obj的C，一个是obj的C，这里是no-obj的C。

第五部分，计算真实框与anchor框的iou，寻找与真实框最匹配的anchor框

for(t = 0; t < l.max_boxes; ++t){
            box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);

            if(!truth.x) break;
            float best_iou = 0;
            int best_n = 0;
            i = (truth.x * l.w);
            j = (truth.y * l.h);
            box truth_shift = truth;
            truth_shift.x = truth_shift.y = 0;
            for(n = 0; n < l.total; ++n){
                box pred = {0};
                pred.w = l.biases[2*n]/net.w;
                pred.h = l.biases[2*n+1]/net.h;
                float iou = box_iou(pred, truth_shift);
                if (iou > best_iou){
                    best_iou = iou;
                    best_n = n;
                }
            }

对于每个真实框，寻找与它最匹配的anchor框，匹配的方式为：将真实框与anchor框对齐，计算iou，最大那个为最匹配。

其中，l.biases保存的是anchor框的长宽数据，这些数据由k-mean对长宽聚类得到，在训练时作为先验值。

第六部分，计算预测框与真实框的损失

int mask_n = int_index(l.mask, best_n, l.n);
            if(mask_n >= 0){
                int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
                float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);

yolov3在三层分别输出不同anchor的匹配框，每层匹配三个anchor，此时mask_n即为当前层筛选后的anchor框，

// Write the box-regression deltas for one predictor and return its IoU with the truth box.
//   truth   - ground-truth box
//   x       - layer output buffer
//   biases  - anchor sizes, stored as (width, height) pairs
//   n       - anchor index into biases
//   index   - offset of the box's first entry (x) inside x and delta
//   i, j    - column and row of the grid cell
//   lw, lh  - grid width and height
//   w, h    - factors multiplying truth.w / truth.h before comparison with the anchor size
//   delta   - buffer receiving scale * (target - output) for the four box entries
//   scale   - weight applied to all four deltas
//   stride  - distance between consecutive entries of the same box
// Returns the IoU between the currently predicted box and truth.
float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
{
    // IoU of the current prediction; returned to the caller.
    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
    float iou = box_iou(pred, truth);

    // Targets in the same space as the stored outputs: position inside the cell
    // for x/y, log of the truth-to-anchor size ratio for w/h (inverse of get_yolo_box).
    float tx = (truth.x*lw - i);
    float ty = (truth.y*lh - j);
    float tw = log(truth.w*w / biases[2*n]);
    float th = log(truth.h*h / biases[2*n + 1]);

    delta[index + 0*stride] = scale * (tx - x[index + 0*stride]);
    delta[index + 1*stride] = scale * (ty - x[index + 1*stride]);
    delta[index + 2*stride] = scale * (tw - x[index + 2*stride]);
    delta[index + 3*stride] = scale * (th - x[index + 3*stride]);
    return iou;
}

主要来看一下delta_yolo_box函数，因为预测值是在格点中的相对位置，而真实值是在整个图片中的相对位置故有转换关系

x(全图)=(i+x(格点))/l.w

再看一下坐标损失

$Loss=\left(x_{truth}-\frac{x_{predict}+i_{location}}{l.w}\right)^2$

求导之后有（PS：求导符号打不出来）：

$-\frac{1}{2}\cdot\frac{\partial Loss}{\partial x_{predict}}=\frac{(x_{truth}\cdot l.w-i_{location})-x_{predict}}{(l.w)^2}$

w和h则正常求导直接计算就可以

在实现的过程中，损失前面的系数实际是自己调节的，忽略了1/(l.w)^2，而是乘了一个因子scale=2-w*h，为了平衡大边框与小边框的损失比例，因为在计算边框损失的时候，很明显大框数值会高于小框。

第七部分，计算分类损失

i = (truth.x * l.w);
j = (truth.y * l.h);

i,j为整型，而truth.x为浮点数，这种强制转换也即取整，即i,j表示的是真实框的格点位置，根据中心点确定真实框处于哪个格点

l.delta[obj_index] = 1 - l.output[obj_index];

int class = net.truth[t*(4 + 1) + b*l.truths + 4];
                if (l.map) class = l.map[class];
                int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
                delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);

在前面是对于每个真实框，寻找出与它最匹配的anchor，再通过cfg里面的mask设置，来筛选这层要使用的anchor。

最后再通过真实框的标签类，定位它所处的类位置。

于是对于每一个真实框，通过匹配的那个anchor来定位anchor层位置，通过中心点取整来定位它的格点位置，通过标签定位类位置。那么一张图片的真实框的对应的通过网络得到的预测框位置也就确定了，因为预测值的数据结构为batch*anchor*(4+1+classes)*(w*h)，任意一个维度都确定了。

上面代码的第一行（l.delta[obj_index] = 1 - l.output[obj_index]）是置信度损失的第二部分，即对匹配到真实框的预测框计算置信度损失；因为在前面遍历预测框时，只计算了best_iou<ignore_thresh的部分（no-obj部分的C），而这里计算的是obj部分的C。

看delta_yolo_class函数

// Write class deltas for one predictor.
//   output  - layer output buffer
//   delta   - delta buffer, same layout as output
//   index   - offset of the predictor's first class entry (class 0)
//   class   - ground-truth class id
//   classes - number of classes
//   stride  - distance between consecutive class entries
//   avg_cat - optional accumulator (may be 0) for the output at the true class
// If delta[index] is still zero, every class gets (target - output) with target 1
// for the true class and 0 otherwise. If delta[index] is already non-zero, only
// the true class entry is rewritten and the other entries are left as they are.
void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
{
    if (delta[index] == 0) {
        for (int c = 0; c < classes; ++c) {
            const int slot = index + stride*c;
            const float target = (c == class) ? 1.0f : 0.0f;
            delta[slot] = target - output[slot];
            if (c == class && avg_cat) {
                *avg_cat += output[slot];
            }
        }
        return;
    }

    // Class deltas were already written for this predictor: update the true class only.
    const int truth_slot = index + stride*class;
    delta[truth_slot] = 1.0f - output[truth_slot];
    if (avg_cat) {
        *avg_cat += output[truth_slot];
    }
}

对于每一个真实框所对应的anchor与位置，计算classes所对应的预测数值的损失。

$Loss=(p_{truth}(0 / 1)-p_{prediction})^2$

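这里的判断条件是delta[index]，而index是传入的class_index（entry=4+1，即第0类所在的位置），并不是置信度的obj_index，所以它与前面best_iou>ignore_thresh时把l.delta[obj_index]赋值为0无关。由于l.delta在前向传播一开始被memset清零，delta[index]不为0通常说明这个预测框的类损失在此之前已经被另一个真实框写过（多个真实框落在同一格点并匹配同一个anchor），此时只把当前真实类的目标置为1，不去覆盖其他类已经写好的梯度，于是一个预测框可以同时拥有多个类标签；如果delta[index]为0，说明是第一次写入，就对所有类计算损失：真实类的目标为1，其余类的目标为0。从整个流程还可以看到，在训练的时候，并不是根据置信度来判断预测框有没有匹配到真实框，因为置信度本身也是预测的，而是根据预测框与真实框的iou（与ignore_thresh比较）以及真实框与anchor的iou（取最匹配的anchor）来判断；置信度只是一个输出预测值，训练时不作其他用途。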

第八部分，输出项

printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f,  count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);

主要变量有：net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count

++count;
++class_count;
if(iou > .5) recall += 1;
if(iou > .75) recall75 += 1;
avg_iou += iou;

结合darknet源码与实际跑程序时候的输出得知，net.index是当前层在网络中的序号（例如yolov3训练日志中的Region 82、Region 94、Region 106），而不是batch的次数。

count在真实值匹配的anchor处于此层输出的anchor时才++，那么这个count表示的就不是真实框的数量，而是表示与这个层的anchor匹配的真实框数量，这个数量要小于真实框，而在计算的时候，也仅仅计算了这几个anchor所对应的层的导数。

avg_iou是iou的求和后的结果，而iou是由预测框与真实框得到，当然这里的真实框已经经过了筛选，更详细地说，iou是经过mask匹配筛选后的真实框，与其中心位置所在的预测框作iou计算得到，于是avg_iou就代表着真实框与模型的平均重叠程度。

avg_cat是在真实框的对应类的位置的预测值的和，也即预测类与真实类的匹配程度。

avg_obj与avg_anyobj的计算方式类似，avg_obj是计算与真实框匹配的预测框的置信度，而avg_anyobj是遍历batch*n(anchor)*w*h数量的置信度，并相加，故它表示的是模型预测的所有预测框的平均置信度，故最终输出的avg_obj的值应该大于0.5，而avg_anyobj的值应该很小，一般小于1%。

最后，recall在iou>0.5时++，而recall75在iou>0.75时++，意味着，如果把iou阈值设为0.5，如果超过阈值，则认为它检测出来了，反之则没检测出来，那么这个变量就代表着召回率，即检测出来的概率（检测出的数目/真实框数目），这两个只是阈值不同，含义一样。在训练的时候，常常可以看到recall一般接近于1，而recall75则变化不定，一般0.2-0.7之间居多。

总结

yolo_layer前向传播的过程主要为：对于预测值，除了预测框的w和h预测值，其他的参数（x，y，置信度，类预测）都进行logistic函数处理，使得其值域处于0-1，值得注意的是类预测中，v2版本用的是softmax loss，类标签互斥，v3版本用的是均方误差（MSE），是把每一类都预测（是这类/不是这类），类标签可以共存，即一个物体可以拥有多个标签。再把预测框参数解析为x,y,w,h，此时预测框有anchor*w*h个，计算其与真实框的iou，并通过iou，为每个预测框寻找最匹配的真实框，通过预测框与它最匹配的真实框的iou与阈值比较，如果小于阈值，则判断它不匹配真实框。而后再计算与真实框最匹配的anchor框，再通过mask筛选此层需要的anchor框，通过中心位置定位预测框位置，这个位置预测框则判定为匹配。对于匹配的预测框，要计算它的类损失：第一次写入时计算全部类的类损失，如果同一个预测框已经被别的真实框写过，则只更新对应于真实类的那一项。对于匹配部分，还要计算预测框损失。而置信度损失由两部分组成，匹配部分和不匹配部分，具体地说是，一部分由预测框与它最匹配的真实框的iou小于阈值时，赋值认为真实置信度为0，计算两者差得到，另一部分由真实框与它中心位置与它最匹配的anchor框对应位置的经mask筛选后的预测框，认为真实置信度为1，计算两者差得到，两者也许不存在交集，但一定存在都不属于两者的预测框，它只是明确地告诉模型哪些可以认为是真的，哪些是假的，其他的不管。这个过程中要注意，预测框与真实框匹不匹配，用了两种判定方式，而置信度，并没有在其中起任何作用，它仅仅是作为一个输出计算损失。而前向传播的主要目的，也是为了分别计算置信度损失，预测框损失和分类损失。

码字不易——点个关注（比心）