YOLO-tensorflow代码解析二（yolo_net.py）

最新推荐文章于 2025-03-10 17:28:39 发布

原创

最新推荐文章于 2025-03-10 17:28:39 发布 · 415 阅读

2 ·

CC 4.0 BY-SA版权

本文深入解析YOLO在TensorFlow中的实现，主要聚焦于yolo_net.py文件，详细探讨其代码逻辑和关键部分，帮助读者理解YOLO网络的构建过程。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import numpy as np
import tensorflow as tf
import yolo.config as cfg

# Short module-level alias so the layer helpers under tf.contrib.slim can be
# referenced as ``slim.<name>`` throughout this file.
slim = tf.contrib.slim


class YOLONet(object):

    def __init__(self, is_training=True):
        """Load hyper-parameters from ``cfg`` and assemble the network graph.

        The constructor copies its settings from the ``cfg`` module, creates
        the ``images`` input placeholder and stores the result of
        ``build_network`` in ``self.logits``.

        Args:
            is_training: Forwarded to ``build_network``. When true, a label
                placeholder is additionally created, ``loss_layer`` is
                invoked on the logits and labels, and the collected total
                loss is stored in ``self.total_loss`` and registered as the
                ``total_loss`` scalar summary.
        """
        # --- Detection geometry ------------------------------------------
        self.classes = cfg.CLASSES                 # class names
        self.num_class = len(self.classes)
        self.image_size = cfg.IMAGE_SIZE           # side of the square input
        self.cell_size = cfg.CELL_SIZE             # grid cells per side
        self.boxes_per_cell = cfg.BOXES_PER_CELL   # boxes predicted per cell

        # Each grid cell contributes ``num_class`` values plus 5 values per
        # predicted box to the flat output vector.
        num_cells = self.cell_size * self.cell_size
        values_per_cell = self.num_class + self.boxes_per_cell * 5
        self.output_size = num_cells * values_per_cell

        # Input pixels covered by one grid cell along a side (float).
        self.scale = float(self.image_size) / self.cell_size

        # Two cut indices into the flat output vector.
        # NOTE(review): presumably class scores | box confidences | box
        # coordinates -- the consumer is outside this block, confirm there.
        self.boundary1 = num_cells * self.num_class
        self.boundary2 = self.boundary1 + num_cells * self.boxes_per_cell

        # --- Loss weights ------------------------------------------------
        self.object_scale = cfg.OBJECT_SCALE       # cells containing an object
        self.noobject_scale = cfg.NOOBJECT_SCALE   # cells without an object
        self.class_scale = cfg.CLASS_SCALE         # classification term
        self.coord_scale = cfg.COORD_SCALE         # box-coordinate term

        # --- Optimisation / activation settings --------------------------
        self.learning_rate = cfg.LEARNING_RATE
        self.batch_size = cfg.BATCH_SIZE
        self.alpha = cfg.ALPHA                     # passed to build_network

        # Column index of every grid cell, repeated for each box. Built as
        # (boxes, cell, cell) and then the box axis is moved last, giving
        # shape (cell, cell, boxes) with offset[i, j, b] == j.
        column_index = np.arange(self.cell_size)
        tiled = np.tile(
            column_index, (self.boxes_per_cell, self.cell_size, 1))
        self.offset = np.transpose(tiled, (1, 2, 0))

        # --- Forward graph -----------------------------------------------
        image_shape = [None, self.image_size, self.image_size, 3]
        self.images = tf.placeholder(
            dtype=tf.float32, shape=image_shape, name='images')
        self.logits = self.build_network(
            self.images,
            num_outputs=self.output_size,
            alpha=self.alpha,
            is_training=is_training)

        # Inference-only graphs need neither labels nor a loss.
        if not is_training:
            return

        # --- Training-only graph -----------------------------------------
        # Ground truth carries 5 + num_class values per grid cell.
        label_shape = [
            None, self.cell_size, self.cell_size, 5 + self.num_class]
        self.labels = tf.placeholder(dtype=tf.float32, shape=label_shape)
        self.loss_layer(self.logits, self.labels)
        self.total_loss = tf.losses.get_total_loss()
        tf.summary.scalar('total_loss', self.total_loss)

    # 构造网络图
    def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
        with tf.variable_scope(scope):
            with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
                net = tf.pad(
                    images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),   #对输入数据的宽高进行填充，batch_size和channel不做填充
                    name='pad_1')
                net = slim.conv2d(
                    net, 64, 7, 2, padding='VALID', scope='conv_2')# conv：64个7x7的卷积核，以2为步伐进行卷积,out:224x224*64
                net = slim.max_pool2d(net, 2, padding='SAME', scope='pool_3')# pool:最大池化kernel=2,stride = 2,out:112x112x64
                net = slim.