A walkthrough of a TensorFlow implementation of YOLOv1

YOLOv1 has been around for quite a while. The code originally released by the author was built on his own framework, darknet, which is implemented in C. Although many frameworks exist today, I personally feel TensorFlow still has strong practical advantages, mainly in extensibility; of course it varies from person to person, since a framework is ultimately just a tool. Other blogs have also walked through this code, but I felt their explanations were still lacking in places, so I decided to write my own based on my understanding. The code below is a TensorFlow implementation of YOLOv1; for the underlying theory, see: https://blog.youkuaiyun.com/yongjiankuang/article/details/71057288

First comes the configuration file config.py, which holds the basic parameters of the network.

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 28 22:31:05 2018

@author: Administrator

config.py
"""
import os

DATA_PATH = 'data'

PASCAL_PATH = os.path.join(DATA_PATH,'pascal_voc')

CACHE_PATH = os.path.join(PASCAL_PATH,'cache')

OUTPUT_DIR = os.path.join(PASCAL_PATH,'output')

WEIGHTS_DIR = os.path.join(PASCAL_PATH,'weights')

WEIGHTS_FILE = None


CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
           'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
           'train', 'tvmonitor']


FLIPPED = True



# model parameters
IMAGE_SIZE = 448  # input image size

CELL_SIZE = 7

BOXES_PER_CELL = 2

ALPHA = 0.1

DISP_CONSOLE = False

OBJECT_SCALE = 1.0    # loss weight for boxes responsible for an object
NOOBJECT_SCALE = 1.0  # loss weight for boxes not responsible for any object
CLASS_SCALE = 2.0     # loss weight for classification
COORD_SCALE = 5.0     # loss weight for box coordinates


#
# solver parameter
#

GPU = False

LEARNING_RATE = 0.0001

DECAY_STEPS = 30000

DECAY_RATE = 0.1

STAIRCASE = True

BATCH_SIZE = 45

MAX_ITER = 15000

SUMMARY_ITER = 10

SAVE_ITER = 1000


#
# test parameter
#

THRESHOLD = 0.2

IOU_THRESHOLD = 0.5
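As a quick sanity check, the network's output dimension and the slice boundaries used later in yolo_net.py and test.py follow directly from these constants. A minimal sketch in plain Python:

cell_size = 7
num_class = 20          # len(CLASSES)
boxes_per_cell = 2

output_size = cell_size * cell_size * (num_class + boxes_per_cell * 5)  # 7*7*30 = 1470
boundary1 = cell_size * cell_size * num_class                           # 980 class probabilities
boundary2 = boundary1 + cell_size * cell_size * boxes_per_cell          # 980 + 98 = 1078 confidences
# the remaining 1470 - 1078 = 392 values are the 98 boxes * 4 coordinates
print(output_size, boundary1, boundary2)  # 1470 980 1078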

yolo_net.py declares the network structure.

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 28 22:32:26 2018

@author: kuangyongjian
"""
import numpy as np
import tensorflow as tf
import yolo.config as cfg

slim = tf.contrib.slim

class YOLONet(object):
	
	def __init__(self, is_training=True):
		self.classes = cfg.CLASSES # object classes
		self.num_class = len(self.classes) # number of classes
		self.image_size = cfg.IMAGE_SIZE # input image size
		self.cell_size = cfg.CELL_SIZE # grid size
		self.boxes_per_cell = cfg.BOXES_PER_CELL # boxes predicted per cell
		self.output_size = (self.cell_size * self.cell_size) * \
			(self.num_class + self.boxes_per_cell * 5) # output dimension: 7*7*(20 + 2*5) = 1470
			
		self.scale = 1.0 * self.image_size / self.cell_size # pixels per cell (448/7 = 64)
		
		# slice boundaries: 7*7*20 class probabilities first, then 7*7*2 box confidences
		self.boundary1 = self.cell_size * self.cell_size * self.num_class 
		self.boundary2 = self.boundary1 +\
			self.cell_size * self.cell_size * self.boxes_per_cell
			
		
		self.object_scale = cfg.OBJECT_SCALE # weight (1.0) for boxes containing an object
		self.noobject_scale = cfg.NOOBJECT_SCALE # weight (1.0) for boxes without an object
		self.class_scale = cfg.CLASS_SCALE # weight for the classification loss
		self.coord_scale = cfg.COORD_SCALE # weight for the coordinate loss
		
		self.learning_rate = cfg.LEARNING_RATE # learning rate
		self.batch_size = cfg.BATCH_SIZE # batch size
		self.alpha = cfg.ALPHA # leaky ReLU slope
		
		#[2,7,7] -> [7,7,2]
		self.offset = np.transpose(np.reshape(np.array(
            [np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell),
            (self.boxes_per_cell, self.cell_size, self.cell_size)), (1, 2, 0))
		
		tf.reset_default_graph()	
		# input placeholder: 448x448x3
		self.images = tf.placeholder(
            tf.float32, [None, self.image_size, self.image_size, 3],
            name='images')
		
		# build the network graph; returns the raw 1470-dim predictions
		self.logits = self.build_network(self.images, num_outputs=self.output_size, alpha=self.alpha, is_training=is_training)
						
		if is_training:
			# during training, each cell's ground-truth label has 5 + num_class = 25 channels
		    self.labels = tf.placeholder(
				tf.float32,
				[None, self.cell_size, self.cell_size, 5 + self.num_class])
		    self.loss_layer(self.logits, self.labels)
		    self.total_loss = tf.losses.get_total_loss()
		    tf.summary.scalar('total_loss', self.total_loss)
	
	# build the network graph
	def build_network(self,
                      images,
                      num_outputs,
                      alpha,
                      keep_prob=0.5,
                      is_training=True,
                      scope='yolo'):
		
		#tf.reset_default_graph()
		with tf.variable_scope(scope):
			with slim.arg_scope(
                [slim.conv2d, slim.fully_connected],
                activation_fn=leaky_relu(alpha),
                weights_regularizer=slim.l2_regularizer(0.0005),
                weights_initializer=tf.truncated_normal_initializer(0.0, 0.01)
            ):
				net = tf.pad(\
                images, np.array([[0, 0], [3, 3], [3, 3], [0, 0]]),
                name='pad_1') # pad height and width of the input; batch and channel dims untouched
				net = slim.conv2d(\
                net, 64, 7, 2, padding='VALID', scope='conv_2') # conv: 64 7x7 kernels, stride 2, out: 224x224x64
				net = slim.max_pool2d(net,2,padding = 'SAME',scope = 'pool_3') # pool: kernel 2, stride 2, out: 112x112x64
				net = slim.conv2d(net,192,3,scope = 'conv_4') # conv: 192 3x3 kernels, out: 112x112x192
				net = slim.max_pool2d(net,2,padding = 'SAME',scope = 'pool_5') # pool: kernel 2, stride 2, out: 56x56x192
				net = slim.conv2d(net,128,1,scope = 'conv_6') # conv: 128 1x1 kernels, out: 56x56x128
				net = slim.conv2d(net,256,3,scope = 'conv_7') # conv: 256 3x3 kernels, out: 56x56x256
				net = slim.conv2d(net,256,1,scope = 'conv_8') # conv: 256 1x1 kernels, out: 56x56x256
				net = slim.conv2d(net,512,3,scope = 'conv_9') # conv: 512 3x3 kernels, out: 56x56x512
				net = slim.max_pool2d(net,2,padding = 'SAME',scope = 'pool_10') # pool: kernel 2, stride 2, out: 28x28x512
				net = slim.conv2d(net,256,1,scope = 'conv_11') # conv: 256 1x1 kernels, out: 28x28x256
				net = slim.conv2d(net,512,3,scope = 'conv_12') # conv: 512 3x3 kernels, out: 28x28x512
				net = slim.conv2d(net,256,1,scope = 'conv_13') # conv: 256 1x1 kernels, out: 28x28x256
				net = slim.conv2d(net,512,3,scope = 'conv_14') # conv: 512 3x3 kernels, out: 28x28x512
				net = slim.conv2d(net,256,1,scope = 'conv_15') # conv: 256 1x1 kernels, out: 28x28x256
				net = slim.conv2d(net,512,3,scope = 'conv_16') # conv: 512 3x3 kernels, out: 28x28x512
				net = slim.conv2d(net,256,1,scope = 'conv_17') # conv: 256 1x1 kernels, out: 28x28x256
				net = slim.conv2d(net,512,3,scope = 'conv_18') # conv: 512 3x3 kernels, out: 28x28x512
				net = slim.conv2d(net,512,1,scope = 'conv_19') # conv: 512 1x1 kernels, out: 28x28x512
				net = slim.conv2d(net,1024,3,scope = 'conv_20') # conv: 1024 3x3 kernels, out: 28x28x1024
				net = slim.max_pool2d(net,2,padding = 'SAME',scope = 'pool_21') # pool: kernel 2, stride 2, out: 14x14x1024
				net = slim.conv2d(net,512,1,scope = 'conv_22') # conv: 512 1x1 kernels, out: 14x14x512
				net = slim.conv2d(net,1024,3,scope = 'conv_23') # conv: 1024 3x3 kernels, out: 14x14x1024
				net = slim.conv2d(net,512,1,scope = 'conv_24') # conv: 512 1x1 kernels, out: 14x14x512
				net = slim.conv2d(net,1024,3,scope = 'conv_25') # conv: 1024 3x3 kernels, out: 14x14x1024
				net = slim.conv2d(net,1024,3,scope = 'conv_26') # conv: 1024 3x3 kernels, out: 14x14x1024
				net = tf.pad(net,np.array([[0,0],[1,1],[1,1],[0,0]]),name = 'pad_27') # pad the feature map before the strided conv
				net = slim.conv2d(net,1024,3,2,padding = 'VALID',scope = 'conv_28') # conv: 1024 3x3 kernels, stride 2, out: 7x7x1024
				net = slim.conv2d(net,1024,3,scope = 'conv_29') # conv: 1024 3x3 kernels, out: 7x7x1024
				net = slim.conv2d(net,1024,3,scope = 'conv_30') # conv: 1024 3x3 kernels, out: 7x7x1024
				net = tf.transpose(net,[0,3,1,2],name = 'trans_31') # reorder to [batch_size, channels, 7, 7] before flattening
				net = slim.flatten(net,scope = 'flat_32') # flatten to [batch_size, 7*7*1024]
				net = slim.fully_connected(net,512,scope = 'fc_33') # fc: 512
				net = slim.fully_connected(net,4096,scope = 'fc_34') # fc: 4096
				net = slim.dropout(
					net,keep_prob = keep_prob,is_training = is_training,
					scope = 'dropout_35') # dropout
				net = slim.fully_connected(net,num_outputs,activation_fn = None,scope = 'fc_36') # fc: 1470 raw outputs, no activation
					
			return net
		
	def calc_iou(self,boxes1,boxes2,scope = 'iou'):
		
		with tf.variable_scope(scope):
			#transform (x_center,y_center,w,h) to (x1,y1,x2,y2)
			boxes1_t = tf.stack([boxes1[...,0] - boxes1[...,2] / 2.0, #x - w / 2
								boxes1[...,1] - boxes1[...,3] / 2.0,  #y - h / 2
								boxes1[...,0] + boxes1[...,2] / 2.0,  #x + w / 2
								boxes1[...,1] + boxes1[...,3] / 2.0], #y + h / 2
								axis = -1)
								
			boxes2_t = tf.stack([boxes2[...,0] - boxes2[...,2] / 2.0,
								boxes2[...,1] - boxes2[...,3] / 2,
								boxes2[...,0] + boxes2[...,2] / 2,
								boxes2[...,1] + boxes2[...,3] / 2],
								axis = -1)
	
			#calculate the left up point and right down point
			lu = tf.maximum(boxes1_t[...,:2],boxes2_t[...,:2])
			rd = tf.minimum(boxes1_t[...,2:],boxes2_t[...,2:])
			
			#intersection
			intersection = tf.maximum(0,rd - lu)
			inter_square = intersection[...,0] * intersection[...,1]
			
			#calculate the boxs1 square and box2 square
			square1 = boxes1[...,2] * boxes1[...,3]
			square2 = boxes2[...,2] * boxes2[...,3]
			
			# union area, clamped to avoid division by zero
			union_square = tf.maximum(square1 + square2 - inter_square,1e-10)
		return tf.clip_by_value(inter_square / union_square,0.0,1.0)
		
		
	def loss_layer(self,predicts,labels,scope = 'loss_layer'):
		with tf.variable_scope(scope):
			predict_classes = tf.reshape(predicts[:,:self.boundary1],[self.batch_size,self.cell_size,self.cell_size,self.num_class]) # predicted class probabilities
			predict_scales = tf.reshape(predicts[:,self.boundary1:self.boundary2],[self.batch_size,self.cell_size,self.cell_size,self.boxes_per_cell]) # predicted box confidences
			predict_boxes = tf.reshape(predicts[:,self.boundary2:],[self.batch_size,self.cell_size,self.cell_size,self.boxes_per_cell,4]) # predicted box coordinates
			
			# label layout per cell: [response, box(4), classes(20)]
			response = tf.reshape(labels[...,0],[self.batch_size,self.cell_size,self.cell_size,1]) # ground-truth responsibility flag
			boxes = tf.reshape(labels[...,1:5],[self.batch_size,self.cell_size,self.cell_size,1,4]) # ground-truth boxes
			boxes = tf.tile(boxes,[1,1,1,self.boxes_per_cell,1]) / self.image_size # replicate per predicted box and normalize to [0,1]; note the division belongs outside tf.tile
			classes = labels[...,5:] # ground-truth one-hot classes
			
			offset = tf.reshape(tf.constant(self.offset,dtype = tf.float32),[1,self.cell_size,self.cell_size,self.boxes_per_cell])
			offset = tf.tile(offset,[self.batch_size,1,1,1]) # replicate the offset for every sample in the batch
			offset_tran = tf.transpose(offset,(0,2,1,3))
			predict_boxes_tran = tf.stack([(predict_boxes[...,0] + offset) / self.cell_size,
									(predict_boxes[...,1] + offset_tran) / self.cell_size,
									tf.square(predict_boxes[...,2]),
									tf.square(predict_boxes[...,3])],axis = -1)
									
			iou_predict_truth = self.calc_iou(predict_boxes_tran,boxes)
			
			#calculate I tensor [BATCH_SIZE, CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
			object_mask = tf.reduce_max(iou_predict_truth,3,keep_dims = True) # max IoU over the boxes in each cell, keeping the dimension
			object_mask = tf.cast((iou_predict_truth >= object_mask),tf.float32) * response # 1 where a box is responsible for an object, 0 elsewhere
			
			#calculate no_I tensor [CELL_SIZE, CELL_SIZE, BOXES_PER_CELL]
			noobject_mask = tf.ones_like(object_mask,dtype = tf.float32) - object_mask # mask for boxes not responsible for any object
			
			boxes_tran = tf.stack([
			boxes[...,0] * self.cell_size - offset,
			boxes[...,1] * self.cell_size - offset_tran,
			tf.sqrt(boxes[...,2]),
			tf.sqrt(boxes[...,3])],axis = -1)
			
			#class_loss: classification loss
			class_delta = response * (predict_classes - classes)  # class error, counted only for cells that contain an object
			class_loss = tf.reduce_mean(
				tf.reduce_sum(tf.square(class_delta),axis = [1,2,3]),name = 'class_loss') * self.class_scale
				
			#object_loss: confidence loss for boxes responsible for an object
			object_delta = object_mask * (predict_scales - iou_predict_truth)
			object_loss = tf.reduce_mean(
				tf.reduce_sum(tf.square(object_delta),axis = [1,2,3]),name = 'object_loss') * self.object_scale
			
			#noobject_delta
			noobject_delta = noobject_mask * predict_scales
			noobject_loss = tf.reduce_mean(
				tf.reduce_sum(tf.square(noobject_delta),axis = [1,2,3]),
				name = 'noobject_loss') * self.noobject_scale
				
			#coord_loss
			coord_mask = tf.expand_dims(object_mask,4) # add a coordinate dimension to the mask
			boxes_delta = coord_mask * (predict_boxes - boxes_tran)
			coord_loss = tf.reduce_mean(
				tf.reduce_sum(tf.square(boxes_delta),axis = [1,2,3,4]),
				name = 'coord_loss') * self.coord_scale
			
			#total loss
			tf.losses.add_loss(class_loss)
			tf.losses.add_loss(object_loss)
			tf.losses.add_loss(noobject_loss)
			tf.losses.add_loss(coord_loss)
			
			tf.summary.histogram('boxes_delta_x',boxes_delta[...,0])
			tf.summary.histogram('boxes_delta_y',boxes_delta[...,1])
			tf.summary.histogram('boxes_delta_w',boxes_delta[...,2])
			tf.summary.histogram('boxes_delta_h',boxes_delta[...,3])
			tf.summary.histogram('iou',iou_predict_truth)

# leaky ReLU activation factory
def leaky_relu(alpha):
	def op(inputs):
		return tf.nn.leaky_relu(inputs,alpha = alpha,name = 'leaky_relu')
	return op
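To see what calc_iou computes, here is a small NumPy sketch of the same corner-conversion and intersection logic for two single boxes in (center_x, center_y, w, h) format; the box values are made up for illustration:

import numpy as np

def iou_xywh(b1, b2):
    # convert (cx, cy, w, h) to corners (x1, y1, x2, y2)
    b1c = np.array([b1[0] - b1[2]/2, b1[1] - b1[3]/2, b1[0] + b1[2]/2, b1[1] + b1[3]/2])
    b2c = np.array([b2[0] - b2[2]/2, b2[1] - b2[3]/2, b2[0] + b2[2]/2, b2[1] + b2[3]/2])
    lu = np.maximum(b1c[:2], b2c[:2])   # upper-left corner of the intersection
    rd = np.minimum(b1c[2:], b2c[2:])   # lower-right corner of the intersection
    inter = np.maximum(0.0, rd - lu)
    inter_area = inter[0] * inter[1]
    union = b1[2] * b1[3] + b2[2] * b2[3] - inter_area
    return inter_area / max(union, 1e-10)

print(iou_xywh([0.5, 0.5, 0.4, 0.4], [0.6, 0.6, 0.4, 0.4]))  # ~0.391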

pascal_voc.py converts Pascal VOC annotations into data the network can train on.

"""
pascal_voc:主要功能
对图像数据进行归一化,同时获取相应的标签数据
"""
import os
import xml.etree.ElementTree as ET
import numpy as np
import cv2
import pickle
import copy
import yolo.config as cfg


class pascal_voc(object):
	def __init__(self,phase,rebuild = False):
		self.devkil_path = os.path.join(cfg.PASCAL_PATH,'VOCdevkit')
		self.data_path = os.path.join(self.devkil_path,'VOC2007')
		self.cache_path = cfg.CACHE_PATH
		self.batch_size = cfg.BATCH_SIZE
		self.image_size = cfg.IMAGE_SIZE
		self.cell_size = cfg.CELL_SIZE
		self.classes = cfg.CLASSES
		self.class_to_ind = dict(zip(self.classes,range(len(self.classes))))
		self.flipped = cfg.FLIPPED
		self.phase = phase
		self.rebuild = rebuild
		self.cursor = 0
		self.epoch = 1
		self.gt_labels = None
		self.prepare()
	
	# read a batch of images and their labels
	def get(self):
		images = np.zeros((self.batch_size,self.image_size,self.image_size,3))
		labels = np.zeros((self.batch_size,self.cell_size,self.cell_size,25))
		count = 0
		while count < self.batch_size:
			imname = self.gt_labels[self.cursor]['imname']
			flipped = self.gt_labels[self.cursor]['flipped']
			images[count,:,:,:] = self.image_read(imname,flipped)
			labels[count,:,:,:] = self.gt_labels[self.cursor]['label']
			count += 1
			self.cursor += 1
			if self.cursor >= len(self.gt_labels):
				np.random.shuffle(self.gt_labels)
				self.cursor = 0
				self.epoch += 1
				
		return images,labels
	
	# convert the image format and normalize pixel values
	def image_read(self,imname,flipped = False):
		image = cv2.imread(imname)
		image = cv2.resize(image,(self.image_size,self.image_size))
		image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB).astype(np.float32)
		image = (image / 255.0) * 2.0 - 1.0 # normalize to [-1, 1]
		
		if flipped:
			image = image[:,::-1,:] # mirror along the x axis
		return image
	
	# prepare the ground-truth labels
	def prepare(self):
		gt_labels = self.load_labels()
		# if flipping is enabled, mirror the label information accordingly
		if self.flipped:
			print('Appending horizontally-flipped training examples...')
			gt_labels_cp = copy.deepcopy(gt_labels)
			for idx in range(len(gt_labels_cp)):
				gt_labels_cp[idx]['flipped'] = True
				gt_labels_cp[idx]['label'] = gt_labels_cp[idx]['label'][:,::-1,:] # mirror the label grid horizontally
				for i in range(self.cell_size):
					for j in range(self.cell_size):
						if gt_labels_cp[idx]['label'][i,j,0] == 1:
							gt_labels_cp[idx]['label'][i,j,1] = \
								self.image_size - 1 - \
								gt_labels_cp[idx]['label'][i,j,1]
			
			gt_labels += gt_labels_cp
		np.random.shuffle(gt_labels) # shuffle the samples
		self.gt_labels = gt_labels
		return gt_labels
		
		
	# each gt_labels entry stores the image path, the label tensor, and a flipped flag
	def load_labels(self):
		cache_file = os.path.join(self.cache_path,'pascal_' + self.phase + '_gt_labels.pkl')
		
		if os.path.isfile(cache_file) and not self.rebuild:
			print('Loading gt_labels from: ' + cache_file)
			with open(cache_file,'rb') as f:
				gt_labels = pickle.load(f)
			return gt_labels
			
		print('processing gt_labels from: ' + self.data_path)
		
		if not os.path.exists(self.cache_path):
			os.makedirs(self.cache_path)
		
		if self.phase == 'train':
			txtname = os.path.join(self.data_path,'ImageSets','Main','trainval.txt')
		else:
			txtname = os.path.join(self.data_path,'ImageSets','Main','test.txt')
			
		with open(txtname,'r') as f:
			self.image_index = [x.strip() for x in f.readlines()]
			
		gt_labels = []
		for index in self.image_index:
			label,num = self.load_pascal_annotation(index)
			if num == 0:
				continue
			imname = os.path.join(self.data_path,'JPEGImages',index + '.jpg')
			gt_labels.append({'imname':imname,
								'label':label,
								'flipped':False})
								
		print('Saving gt_labels to:' + cache_file)
		with open(cache_file,'wb') as f:
			pickle.dump(gt_labels,f)
		return gt_labels
			
		
		
	# build the label tensor for the objects in one image
	def load_pascal_annotation(self,index):
		
		imname = os.path.join(self.data_path,'JPEGImages',index + '.jpg')
		im = cv2.imread(imname)
		
		# resize ratios from the original image to 448x448
		h_ratio = 1.0 * self.image_size / im.shape[0]
		w_ratio = 1.0 * self.image_size / im.shape[1]
		
		label = np.zeros((self.cell_size,self.cell_size,25))
		filename = os.path.join(self.data_path,'Annotations',index + '.xml')
		tree = ET.parse(filename)
		objs = tree.findall('object') # all annotated objects
		
		for obj in objs:
			bbox = obj.find('bndbox') # object bounding box
			# scale the box into the resized 448x448 image, clamped to the valid pixel range
			x1 = max(min((float(bbox.find('xmin').text) - 1) * w_ratio,self.image_size - 1),0)
			y1 = max(min((float(bbox.find('ymin').text) - 1) * h_ratio,self.image_size - 1),0)
			x2 = max(min((float(bbox.find('xmax').text) - 1) * w_ratio,self.image_size - 1),0)
			y2 = max(min((float(bbox.find('ymax').text) - 1) * h_ratio,self.image_size - 1),0)
			
			cls_ind = self.class_to_ind[obj.find('name').text.lower().strip()]
			boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1] # convert corners to (center_x, center_y, w, h)
			x_ind = int(boxes[0] * self.cell_size / self.image_size) # grid cell containing the box center
			y_ind = int(boxes[1] * self.cell_size / self.image_size)
			if label[y_ind,x_ind,0] == 1:
				continue
			label[y_ind,x_ind,0] = 1
			label[y_ind,x_ind,1:5] = boxes
			label[y_ind,x_ind,5 + cls_ind] = 1
			
		return label,len(objs)
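The label layout built above is worth making explicit: each of the 7x7 cells holds a 25-vector [response, center_x, center_y, w, h, one-hot class(20)], with coordinates in 448-pixel units. A minimal sketch with a hypothetical object (the numbers are invented for illustration):

import numpy as np

cell_size, image_size = 7, 448
label = np.zeros((cell_size, cell_size, 25))

# hypothetical 'dog' (class index 11) with corners (100, 150)-(300, 350) in the 448x448 image
x1, y1, x2, y2 = 100.0, 150.0, 300.0, 350.0
boxes = [(x2 + x1) / 2.0, (y2 + y1) / 2.0, x2 - x1, y2 - y1]  # (cx, cy, w, h) = (200, 250, 200, 200)
x_ind = int(boxes[0] * cell_size / image_size)  # column 3 holds the box center
y_ind = int(boxes[1] * cell_size / image_size)  # row 3
label[y_ind, x_ind, 0] = 1        # response: this cell is responsible for an object
label[y_ind, x_ind, 1:5] = boxes  # box in pixel units
label[y_ind, x_ind, 5 + 11] = 1   # one-hot class entry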
	

timer.py prints timing information while the code runs.

import time
import datetime


class Timer(object):
    '''
    A simple timer.
    '''

    def __init__(self):
        self.init_time = time.time()
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.
        self.remain_time = 0.

    def tic(self):
        # using time.time instead of time.clock because time.clock
        # does not normalize for multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        if average:
            return self.average_time
        else:
            return self.diff

    def remain(self, iters, max_iters):
        if iters == 0:
            self.remain_time = 0
        else:
            self.remain_time = (time.time() - self.init_time) * \
                (max_iters - iters) / iters
        return str(datetime.timedelta(seconds=int(self.remain_time)))
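Typical usage wraps tic/toc around the timed region; toc(average=True) returns a running average over all calls, and remain() extrapolates the time left. A short sketch, where do_training_step is a placeholder for the real work:

timer = Timer()
for step in range(1, 101):
    timer.tic()
    do_training_step()                      # placeholder for the actual work
    avg = timer.toc()                       # average seconds per call so far
print('eta:', timer.remain(100, 15000))     # estimated time remaining for 15000 iterations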

test.py is the inference/test program for the network.

import os
import cv2
import argparse
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer

class Detector(object):

	def __init__(self,net,weights_file):
		self.net = net # the network graph
		self.weights_file = weights_file # path to the trained weights
		self.classes = cfg.CLASSES # class names
		self.num_class = len(self.classes) # number of classes
		self.image_size = cfg.IMAGE_SIZE # input image size
		self.cell_size = cfg.CELL_SIZE # grid size
		self.boxes_per_cell = cfg.BOXES_PER_CELL # boxes per cell
		self.threshold = cfg.THRESHOLD
		self.iou_threshold = cfg.IOU_THRESHOLD
		self.boundary1 = self.cell_size * self.cell_size * self.num_class # end of the class-probability slice
		self.boundary2 = self.boundary1 + \
			self.cell_size * self.cell_size * self.boxes_per_cell # end of the confidence slice
		self.sess = tf.Session() # create the session
		self.sess.run(tf.global_variables_initializer()) # initialize all graph variables
		
		# restore the trained model
		print('Restoring weights from: ' + self.weights_file)
		self.saver = tf.train.Saver()
		self.saver.restore(self.sess,self.weights_file)
        
	def draw_result(self, img, result):
		for i in range(len(result)):
			x = int(result[i][1])
			y = int(result[i][2])
			w = int(result[i][3] / 2)
			h = int(result[i][4] / 2)
			cv2.rectangle(img, (x - w, y - h), (x + w, y + h), (0, 255, 0), 2)
			cv2.rectangle(img, (x - w, y - h - 20),
                          (x + w, y - h), (125, 125, 125), -1)
			lineType = cv2.LINE_AA if cv2.__version__ > '3' else cv2.CV_AA
			cv2.putText(
                img, result[i][0] + ' : %.2f' % result[i][5],
                (x - w + 5, y - h - 7), cv2.FONT_HERSHEY_SIMPLEX, 0.5,
                (0, 0, 0), 1, lineType)
	
	def detect(self,img):
		img_h,img_w,_ = img.shape
		inputs= cv2.resize(img,(self.image_size,self.image_size))
		inputs = cv2.cvtColor(inputs,cv2.COLOR_BGR2RGB).astype(np.float32) #bgr -> rgb
		inputs = (inputs / 255.0) * 2.0 - 1.0
		inputs = np.reshape(inputs,(1,self.image_size,self.image_size,3))
		
		result = self.detect_from_cvmat(inputs)[0]
		
		for i in range(len(result)):
			result[i][1] *= (1.0 * img_w / self.image_size)
			result[i][2] *= (1.0 * img_h / self.image_size)
			result[i][3] *= (1.0 * img_w / self.image_size)
			result[i][4] *= (1.0 * img_h / self.image_size)
		
		return result
	
	def detect_from_cvmat(self,inputs):
		net_output = self.sess.run(self.net.logits,
							feed_dict = {self.net.images:inputs})
							
		results = []
		for i in range(net_output.shape[0]):
			results.append(self.interpret_output(net_output[i]))
			
		return results
	
	# filter the raw network output into qualifying boxes with their class probabilities
	def interpret_output(self,output):
		probs = np.zeros((self.cell_size,self.cell_size,
						self.boxes_per_cell,self.num_class)) # per-box, per-class scores
		
		class_probs = np.reshape(
			output[0:self.boundary1],
			(self.cell_size,self.cell_size,self.num_class)) # class probabilities per cell, [7,7,20]
			
		scales = np.reshape(
			output[self.boundary1:self.boundary2],
			(self.cell_size,self.cell_size,self.boxes_per_cell)) # box confidences per cell, [7,7,2]
			
		boxes = np.reshape(
			output[self.boundary2:],
			(self.cell_size,self.cell_size,self.boxes_per_cell,4)) # box coordinates, [7,7,2,4]
		
		offset = np.array(
			[np.arange(self.cell_size)] * self.cell_size * self.boxes_per_cell)

		offset = np.transpose(
			np.reshape(
			offset,
			[self.boxes_per_cell,self.cell_size,self.cell_size]),
			(1,2,0)) #offset [2,7,7] -> [7,7,2]
			
		boxes[:,:,:,0] += offset
		boxes[:,:,:,1] += np.transpose(offset,(1,0,2))
		boxes[:,:,:,:2] = 1.0 * boxes[:,:,:,0:2] / self.cell_size
		boxes[:,:,:,2:] = np.square(boxes[:,:,:,2:])
		
		boxes *= self.image_size # map the cell-relative coordinates back onto the 448x448 image
		
		for i in range(self.boxes_per_cell):
			for j in range(self.num_class):
				probs[:,:,i,j] = np.multiply(
					class_probs[:,:,j],scales[:,:,i])  #class * scales
					
		filter_mat_probs = np.array(probs >= self.threshold,dtype = 'bool') # keep boxes whose score reaches the threshold
		filter_mat_boxes = np.nonzero(filter_mat_probs)
		boxes_filtered = boxes[filter_mat_boxes[0],
								filter_mat_boxes[1],
								filter_mat_boxes[2]]
		probs_filtered = probs[filter_mat_probs]
		
		classes_num_filtered = np.argmax(
			filter_mat_probs,axis = 3)[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]]
			
		argsort = np.array(np.argsort(probs_filtered))[::-1]
		boxes_filtered = boxes_filtered[argsort] # boxes sorted by descending score
		probs_filtered = probs_filtered[argsort] # scores sorted descending
		classes_num_filtered = classes_num_filtered[argsort] # classes in the same order
		
		# greedy non-maximum suppression: drop boxes that overlap too much
		for i in range(len(boxes_filtered)):
			if probs_filtered[i] == 0:
				continue
			for j in range(i + 1,len(boxes_filtered)):
				if self.iou(boxes_filtered[i],boxes_filtered[j]) > self.iou_threshold:
					probs_filtered[j] = 0.0
					
		filter_iou = np.array(probs_filtered > 0.0,dtype = 'bool')
		boxes_filtered = boxes_filtered[filter_iou]
		probs_filtered = probs_filtered[filter_iou]
		classes_num_filtered = classes_num_filtered[filter_iou]
		
		result = []
		for i in range(len(boxes_filtered)):
			result.append([self.classes[classes_num_filtered[i]],
				boxes_filtered[i][0],
				boxes_filtered[i][1],
				boxes_filtered[i][2],
				boxes_filtered[i][3],
				probs_filtered[i]])
				
		return result
		
	# compute IoU of two boxes in (center_x, center_y, w, h) format
	def iou(self,box1,box2):
		tb = min(box1[0] + 0.5 * box1[2],box2[0] + 0.5 * box2[2]) - \
			max(box1[0] - 0.5 * box1[2],box2[0] - 0.5 * box2[2]) # width of the overlap
		
		lr = min(box1[1] + 0.5 * box1[3],box2[1] + 0.5 * box2[3]) - \
			max(box1[1] - 0.5 * box1[3],box2[1] - 0.5 * box2[3]) # height of the overlap
		
		inter = 0 if tb < 0 or lr < 0 else tb * lr # intersection area
		return inter / (box1[2] * box1[3] + box2[2] * box2[3] - inter) # intersection / union
		
	# run detection on a camera stream
	def camera_detector(self,cap,wait = 10):
		detect_timer = Timer()
		ret,_ = cap.read()
		
		while ret:
			ret,frame = cap.read()
			detect_timer.tic()
			result = self.detect(frame)
			detect_timer.toc()
			print('Average detecting time: {:.3f}s'.format(      # average detection time per frame
                detect_timer.average_time))
			
			self.draw_result(frame,result)
			cv2.imshow('camera',frame)
			cv2.waitKey(wait)
			
			ret,frame = cap.read()
			
	# run detection on a single image
	def image_detector(self,imname,wait = 0):
		detect_timer = Timer()
		image = cv2.imread(imname)
		
		detect_timer.tic()
		result = self.detect(image)
		detect_timer.toc()
		
		print('Average detecting time: {:.3f}s'.format(      # average detection time
                detect_timer.average_time))
		
		self.draw_result(image,result)
		cv2.imshow('Image',image)
		cv2.waitKey(wait)

		
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--weights',default = 'YOLO_small.ckpt',type = str)
	parser.add_argument('--weight_dir',default = 'weights',type = str)
	parser.add_argument('--data_dir',default = 'data',type = str)
	parser.add_argument('--gpu',default = '',type = str)
	args = parser.parse_args()
	
	os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
	
	yolo = YOLONet(False)
	weight_file = os.path.join(args.data_dir,args.weight_dir,args.weights) # path to the weights
	detector = Detector(yolo,weight_file)
	
	imname = 'test/person.jpg'
	detector.image_detector(imname)


	
if __name__ == '__main__':
	main()
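The offset arithmetic in interpret_output inverts the encoding used during training: x and y are predicted relative to the cell, and w, h as square roots. A NumPy sketch of the decoding applied to the full [7,7,2,4] tensor, using random values in place of real predictions:

import numpy as np

cell_size, boxes_per_cell, image_size = 7, 2, 448
boxes = np.random.rand(cell_size, cell_size, boxes_per_cell, 4)

offset = np.transpose(np.reshape(
    np.array([np.arange(cell_size)] * cell_size * boxes_per_cell),
    (boxes_per_cell, cell_size, cell_size)), (1, 2, 0))  # [7,7,2], each cell's column index

boxes[..., 0] += offset                            # x: add the cell's column index
boxes[..., 1] += np.transpose(offset, (1, 0, 2))   # y: add the cell's row index
boxes[..., :2] /= cell_size                        # normalize centers to [0, 1]
boxes[..., 2:] = np.square(boxes[..., 2:])         # undo the sqrt encoding of w and h
boxes *= image_size                                # map to 448x448 pixel coordinates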
	

train.py is the training program.

# -*- coding: utf-8 -*-
"""
Created on Sat Aug  4 17:35:45 2018

@author: Administrator
"""
import os
import argparse
import datetime
import numpy as np
import tensorflow as tf
import yolo.config as cfg
from yolo.yolo_net import YOLONet
from utils.timer import Timer
from utils.pascal_voc import pascal_voc

slim = tf.contrib.slim

class Solver(object):
	def __init__(self,net,data):
		self.net = net
		self.data = data
		self.weights_file = cfg.WEIGHTS_FILE
		self.max_iter = cfg.MAX_ITER
		self.initial_learning_rate = cfg.LEARNING_RATE
		self.decay_steps = cfg.DECAY_STEPS
		self.decay_rate = cfg.DECAY_RATE
		self.staircase = cfg.STAIRCASE
		self.save_iter = cfg.SAVE_ITER
		self.output_dir = os.path.join(
			cfg.OUTPUT_DIR,datetime.datetime.now().strftime('%Y_%m_%d_%H_%M'))
		
		if not os.path.exists(self.output_dir):
			os.makedirs(self.output_dir)
		self.save_cfg()
		
		self.variable_to_restore = tf.global_variables()
		self.saver = tf.train.Saver(self.variable_to_restore,max_to_keep = None)

Below is a screenshot of the program's detection results (the image is not reproduced here).

Note: this program is not originally mine; it is essentially a reproduction and walkthrough of someone else's code. If anything is off, corrections are welcome. Thanks!

CSDN download link (includes the model files): https://download.youkuaiyun.com/download/yongjiankuang/10627481

GitHub (does not include the model files): https://github.com/yongjiankuang/yolo
