If you are interested in a TensorFlow implementation of the newer YOLO v7 algorithm, see my latest article: Yolo v7的最简TensorFlow实现 (gzroy's CSDN blog).
I have always been interested in object detection algorithms, so this post records my process of reproducing the YOLO paper with TensorFlow. All of the code here is my own.
I consider the YOLO paper a groundbreaking piece of work: by dividing the image into a grid and predicting the objects within each grid cell, it is dramatically faster than other object detection algorithms such as R-CNN and Faster R-CNN. The paper itself is not that easy to follow, especially the part on how the LOSS function is computed. There are many explanations of the YOLO paper online; I think the following one explains it best and covers many of the details clearly, and I recommend reading it.
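As a quick numeric illustration of the grid idea (my own sketch, consistent with the row-major grid indexing used later in this post): a 448×448 input is divided into a 7×7 grid of 64×64 cells, each cell predicts 2 boxes (confidence, x, y, w, h) plus 20 class probabilities, which gives the 7×7×30 output tensor, and the cell responsible for an object is the one containing the object's center:
import numpy as np
image_size = 448
grids = 7
cell_size = image_size // grids   #64 pixels per cell
#A hypothetical ground-truth box (xmin, ymin, xmax, ymax) in pixels
xmin, ymin, xmax, ymax = 100, 150, 300, 400
center_x = (xmin + xmax) / 2      #200.0
center_y = (ymin + ymax) / 2      #275.0
#The cell containing the box center is responsible for predicting the object
col = int(center_x // cell_size)  #3
row = int(center_y // cell_size)  #4
grid_id = row * grids + col       #31, a row-major index into the 7x7 grid
#Each cell predicts 2 boxes * (confidence, x, y, w, h) + 20 class probabilities = 30 values
output = np.zeros((grids, grids, 2 * 5 + 20))
print(grid_id, output.shape)      #31 (7, 7, 30)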
Below is my complete process for implementing YOLO, split into several parts.
1. Building the YOLO model
The first step is to build the model following the network structure described in the paper.
The first 20 convolutional layers are pre-trained on ImageNet data to extract image features, and 4 more convolutional layers plus 2 fully connected layers are then added for object detection. In my actual model, however, I made one change: I dropped the last two fully connected layers and replaced them with a 3×3 convolutional layer with 30 filters, whose output is directly a 7×7×30 tensor. I made this change because training the model exactly as described in the paper converged very poorly, and the two fully connected layers have far too many parameters: on my GTX 1070 Ti with 8 GB of memory I could not use the batch size of 64 mentioned in the paper and had to train with a batch size of 24 instead. The YOLO v2 paper also replaces the final fully connected layers with a convolutional layer.
The yolonet_model.py code below builds the YOLO model. Note that for ImageNet pre-training, the inference parameters pretrain_trainable and pretrain_training must be set to True and yolo_training to False; once pre-training is finished and you switch to object detection training, set pretrain_trainable and pretrain_training to False and yolo_training to True:
import tensorflow as tf
def _conv(name, inputs, kernel_size, in_channels, out_channels, stride, padding, trainable, bias_init, training):
with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
kernel = tf.get_variable(shape=[kernel_size,kernel_size,in_channels,out_channels], initializer=tf.contrib.layers.variance_scaling_initializer(factor=2.0,mode='FAN_IN',uniform=False), trainable=trainable, name='weights')
conv = tf.nn.conv2d(inputs, kernel, [1,stride,stride,1], padding=padding)
biases = tf.get_variable(initializer=tf.constant(bias_init, shape=[out_channels], dtype=tf.float32), trainable=trainable, name='biases')
bias = tf.nn.bias_add(conv, biases)
output = tf.nn.leaky_relu(bias, alpha=0.1, name=name)
output_bn = tf.layers.batch_normalization(output, axis=3, name='bn', trainable=trainable, training=training, reuse=tf.AUTO_REUSE)
return output_bn
def inference(images, pretrain_trainable=True, wd=0.0005, pretrain_training=True, yolo_training=True):
conv1 = _conv('conv1', images, 7, 3, 64, 2, 'SAME', pretrain_trainable, 0.01, pretrain_training) #112*112*64
pool1 = tf.nn.max_pool(conv1, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool1') #56*56*64
conv2 = _conv('conv2', pool1, 3, 64, 192, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #56*56*192
pool2 = tf.nn.max_pool(conv2, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool2') #28*28*192
conv3 = _conv('conv3', pool2, 1, 192, 128, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #28*28*128
conv4 = _conv('conv4', conv3, 3, 128, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #28*28*256
conv5 = _conv('conv5', conv4, 1, 256, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #28*28*256
conv6 = _conv('conv6', conv5, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #28*28*512
pool6 = tf.nn.max_pool(conv6, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool6') #14*14*512
conv7 = _conv('conv7', pool6, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*256
conv8 = _conv('conv8', conv7, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*512
conv9 = _conv('conv9', conv8, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*256
conv10 = _conv('conv10', conv9, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*512
conv11 = _conv('conv11', conv10, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*256
conv12 = _conv('conv12', conv11, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*512
conv13 = _conv('conv13', conv12, 1, 512, 256, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*256
conv14 = _conv('conv14', conv13, 3, 256, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*512
conv15 = _conv('conv15', conv14, 1, 512, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*512
conv16 = _conv('conv16', conv15, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #14*14*1024
pool16 = tf.nn.max_pool(conv16, ksize=[1,2,2,1], strides=[1,2,2,1], padding='VALID', name='pool16') #7*7*1024
conv17 = _conv('conv17', pool16, 1, 1024, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*512
conv18 = _conv('conv18', conv17, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*1024
conv19 = _conv('conv19', conv18, 1, 1024, 512, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*512
conv20 = _conv('conv20', conv19, 3, 512, 1024, 1, 'SAME', pretrain_trainable, 0.01, pretrain_training) #7*7*1024
#For YOLO training, add the conv layers below (the paper's 4 conv + 2 fully connected layers,
#with the FC layers replaced here by a final 3x3 conv as explained above).
#Remember to set the above pretrained 20 conv layers to non-trainable.
if not pretrain_training:
new_conv21 = _conv('new_conv21', conv20, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #14*14*1024
new_conv22 = _conv('new_conv22', new_conv21, 3, 1024, 1024, 2, 'SAME', True, 0.01, yolo_training) #7*7*1024
new_conv23 = _conv('new_conv23', new_conv22, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #7*7*1024
new_conv24 = _conv('new_conv24', new_conv23, 3, 1024, 1024, 1, 'SAME', True, 0.01, yolo_training) #7*7*1024
new_conv25 = _conv('new_conv25', new_conv24, 3, 1024, 30, 1, 'SAME', True, 0.01, yolo_training) #7*7*30
return new_conv25
#For Imagenet pretrain
else:
avg_layer = tf.reduce_mean(conv20, axis=[1,2], keepdims=True) #1024
flatten = tf.layers.flatten(inputs=avg_layer, name='flatten')
with tf.variable_scope('local', reuse=tf.AUTO_REUSE):
weights = tf.get_variable(initializer=tf.truncated_normal([1024,1000], dtype=tf.float32, stddev=1/(1000)), trainable=True, name='weights')
weight_decay = tf.multiply(tf.nn.l2_loss(weights), wd, name='weight_loss')
tf.add_to_collection('losses', weight_decay)
biases = tf.get_variable(initializer=tf.constant(1.0, shape=[1000], dtype=tf.float32), trainable=True, name='biases')
local = tf.nn.xw_plus_b(flatten, weights, biases, name='local')
return local
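To make the two phases described above concrete, this is how I call inference() in each case (a minimal sketch; the placeholder shapes are only for illustration):
import tensorflow as tf
import yolonet_model
#Phase 1: ImageNet pre-training, 224x224 inputs, returns the 1000-way classification logits
images_pretrain = tf.placeholder(tf.float32, [None, 224, 224, 3])
logits = yolonet_model.inference(images_pretrain, pretrain_trainable=True, pretrain_training=True, yolo_training=False)
#Phase 2: YOLO detection training, 448x448 inputs, returns the 7x7x30 detection tensor
images_yolo = tf.placeholder(tf.float32, [None, 448, 448, 3])
detections = yolonet_model.inference(images_yolo, pretrain_trainable=False, pretrain_training=False, yolo_training=True)
In practice each phase runs in its own graph and session; the two calls are shown together here only to contrast the flag settings.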
2. Pre-training on the 1000-class ImageNet data
The training procedure is similar to the one described in my earlier blog posts, so I will not repeat it here.
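To give a rough idea of what that pre-training step looks like, here is a minimal sketch of the classification loss and optimizer (my own outline, not the full script from that earlier post; imagenet_iterator stands for a hypothetical ImageNet input pipeline yielding image/label batches, and the learning rate is only illustrative):
import tensorflow as tf
import yolonet_model
#Hypothetical ImageNet input pipeline: images are 224x224x3, labels are int class ids 0-999
images, labels = imagenet_iterator.get_next()
logits = yolonet_model.inference(images, pretrain_trainable=True, pretrain_training=True, yolo_training=False)
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))
tf.add_to_collection('losses', cross_entropy)
total_loss = tf.add_n(tf.get_collection('losses'), name='pretrain_loss')
#Run the batch norm update ops together with the training step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    pretrain_op = tf.train.MomentumOptimizer(0.01, 0.9).minimize(total_loss)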
3. Preparing the training data for object detection
The paper mentions that training uses the PASCAL VOC 2012 train/val data plus the VOC 2007 train/val/test data, so I use the same data here; it can be downloaded from the PASCAL website. After downloading you will see that the images are stored in the JPEGImages directory and the corresponding object bounding box (BBOX) annotations in the Annotations directory, so the first step is to extract the BBOX data from the annotations. The following code does the extraction and saves the result to a csv file:
import xml.etree.ElementTree as ET
import os
xmlRootDir_train = 'VOC2012_train/Annotations/'
files_train = os.listdir(xmlRootDir_train)
def parseXML(filename):
bbox = [[],[],[],[],[]]
tree = ET.parse(filename)
root = tree.getroot()
size = root.find('size')
width = float(size.find('width').text)
height = float(size.find('height').text)
for node in root.iter("object"):
bndbox = node.find('bndbox')
classname = node.find('name').text
try:
xmin = int(bndbox.find('xmin').text)
ymin = int(bndbox.find('ymin').text)
xmax = int(bndbox.find('xmax').text)
ymax = int(bndbox.find('ymax').text)
bbox[0].append(classname)
bbox[1].append(xmin)
bbox[2].append(ymin)
bbox[3].append(xmax)
bbox[4].append(ymax)
except:
print(filename)
return bbox
bboxfile = open('bbox_train.csv', 'w')
content = ''
for xmlfile in files_train:
bbox = parseXML(xmlRootDir_train+xmlfile)
content += xmlfile
for j in range(5):
content += ','+';'.join([str(x) for x in bbox[j]])
content += '\n'
bboxfile.writelines(content)
bboxfile.close()
Once the BBOX data has been extracted, we can store both the images and the BBOX data in TFRECORD files for convenient training later. When generating the training data I resize every image to 538×538 pixels, because the images are randomly cropped to 448×448 during training, so they are enlarged by 20%. The following code writes the images and BBOX data into multiple TFRECORD files; the resulting TFRECORD files contain 26332 images in total.
#-*- encoding: utf-8 -*-
import tensorflow as tf
import cv2
import numpy as np
import os
from multiprocessing import Process, Queue
import sys
import time
import random
import math
max_num = 1400 #max record number in one file
train_path = 'VOC2012_train/JPEGImages/' #the folder stores the train images
cores = 10 #number of CPU cores to process
resize_width = 538 #448*1.2
resize_height = 538 #448*1.2
grids = 7
#The VOC2012_train images cover 20 categories; build a dict whose keys are the class names and whose values are 0-19
labels_dict = {'person':0, 'bird':1, 'cat':2, 'cow':3, 'dog':4, 'horse':5, 'sheep':6, 'aeroplane':7, 'bicycle':8,
'boat':9, 'bus':10, 'car':11, 'motorbike':12, 'train':13, 'bottle':14, 'chair':15, 'diningtable':16,
'pottedplant':17, 'sofa':18, 'tvmonitor':19}
#Read the bbox csv file
bbox_list = {}
with open('bbox_train.csv', 'r') as bboxfile:
records = bboxfile.readlines()
for record in records:
fields = record.strip().split(',')
filename = fields[0][:-4]
labels = [labels_dict[x] for x in fields[1].split(';')]
xmin = [float(x) for x in fields[2].split(';')]
ymin = [float(x) for x in fields[3].split(';')]
xmax = [float(x) for x in fields[4].split(';')]
ymax = [float(x) for x in fields[5].split(';')]
bbox_list[filename] = [labels, xmin, ymin, xmax, ymax]
files = bbox_list.keys()
#Build the training set file list; each element is the folder path plus the image file name
train_images_filenames = os.listdir(train_path)
train_images = []
for image_file in train_images_filenames:
if image_file[:-4] in files:
train_images.append(train_path+','+image_file)
random.shuffle(train_images)
#Convert the image data and labels into the TFRECORD format
def make_example(image, height, width, label, bbox, filename):
colorspace = b'RGB'
channels = 3
img_format = b'JPEG'
return tf.train.Example(features=tf.train.Features(feature={
'image' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
'height' : tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
'width' : tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
'channels' : tf.train.Feature(int64_list=tf.train.Int64List(value=[channels])),
'colorspace' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[colorspace])),
'img_format' : tf.train.Feature(bytes_list=tf.train.BytesList(value=[img_format])),
'label' : tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
'bbox_xmin' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[0])),
'bbox_xmax' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[2])),
'bbox_ymin' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[1])),
'bbox_ymax' : tf.train.Feature(int64_list=tf.train.Int64List(value=bbox[3])),
'filename': tf.train.Feature(bytes_list=tf.train.BytesList(value=[filename]))
}))
#This function generates the TFRECORD files. The first argument is a list whose elements are image file names, the second is the target folder to write to,
#the third is the starting number for the output file names, and the fourth is the queue used to send progress messages to the parent process
def gen_tfrecord(trainrecords, targetfolder, startnum, queue):
tfrecords_file_num = startnum
file_num = 0
total_num = len(trainrecords)
pid = os.getpid()
queue.put((pid, file_num))
writer = tf.python_io.TFRecordWriter(targetfolder+"train_"+str(tfrecords_file_num)+".tfrecord")
for record in trainrecords:
file_num += 1
fields = record.split(',')
img_raw = cv2.imread(fields[0]+fields[1])
height, width, _ = img_raw.shape
img = cv2.resize(img_raw, (resize_width, resize_height))
height_ratio = resize_height/height
width_ratio = resize_width/width
img_jpg = cv2.imencode('.jpg', img)[1].tobytes()
bbox = bbox_list[fields[1][:-4]]
bbox[1] = [int(item*width_ratio) for item in bbox[1]] #xmin
bbox[3] = [int(item*width_ratio) for item in bbox[3]] #xmax
bbox[2] = [int(item*height_ratio) for item in bbox[2]] #ymin
bbox[4] = [int(item*height_ratio) for item in bbox[4]] #ymax
label = bbox[0]
ex = make_example(img_jpg, resize_height, resize_width, label, bbox[1:], fields[1].encode())
writer.write(ex.SerializeToString())
#Report progress to the parent process after every 100 records written
if file_num%100==0:
queue.put((pid, file_num))
if file_num%max_num==0:
writer.close()
tfrecords_file_num += 1
writer = tf.python_io.TFRecordWriter(targetfolder+"train_"+str(tfrecords_file_num)+".tfrecord")
writer.close()
queue.put((pid, file_num))
#This function generates the TFRECORD files using multiple processes. The first argument is the list of image file names to process, the second is the number of CPU cores to use,
#and the third is the target folder to write to
def process_in_queues(fileslist, cores, targetfolder):
total_files_num = len(fileslist)
each_process_files_num = int(total_files_num/cores)
files_for_process_list = []
for i in range(cores-1):
files_for_process_list.append(fileslist[i*each_process_files_num:(i+1)*each_process_files_num])
files_for_process_list.append(fileslist[(cores-1)*each_process_files_num:])
files_number_list = [len(l) for l in files_for_process_list]
each_process_tffiles_num = math.ceil(each_process_files_num/max_num)
queues_list = []
processes_list = []
for i in range(cores):
queues_list.append(Queue())
#queue = Queue()
processes_list.append(Process(target=gen_tfrecord,
args=(files_for_process_list[i],targetfolder,
each_process_tffiles_num*i+1,queues_list[i],)))
for p in processes_list:
p.start()
#The parent process keeps polling the queues for progress messages and refreshes the display every 0.5 seconds
while(True):
try:
total = 0
progress_str=''
for i in range(cores):
msg=queues_list[i].get()
total += msg[1]
progress_str+='PID'+str(msg[0])+':'+str(msg[1])+'/'+ str(files_number_list[i])+'|'
progress_str+='\r'
print(progress_str, end='')
if total == total_files_num:
for p in processes_list:
p.terminate()
p.join()
break
time.sleep(0.5)
except:
break
return total
if __name__ == '__main__':
print('Start processing train data using %i CPU cores:'%cores)
starttime=time.time()
total_processed = process_in_queues(train_images, cores, targetfolder='train_tf/')
endtime=time.time()
print('\nProcess finish, total process %i images in %i seconds'%(total_processed, int(endtime-starttime)), end='')
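After the files are generated, a record can be read back to check that the image and BBOX data were written correctly (a small verification sketch; train_1.tfrecord is just the first file produced by the script above):
import tensorflow as tf
import numpy as np
import cv2
record_iterator = tf.python_io.tf_record_iterator("train_tf/train_1.tfrecord")
example = tf.train.Example()
example.ParseFromString(next(record_iterator))
feat = example.features.feature
#Decode the stored JPEG bytes back into an image; it should be 538x538x3
img = cv2.imdecode(np.frombuffer(feat['image'].bytes_list.value[0], dtype=np.uint8), cv2.IMREAD_COLOR)
print(feat['filename'].bytes_list.value[0], img.shape)
print(list(feat['label'].int64_list.value), list(feat['bbox_xmin'].int64_list.value))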
4. Training the object detector
Now we can start the training proper. The code is as follows:
import tensorflow as tf
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
from tensorflow import contrib
autograph = contrib.autograph
import yolonet_model
import time
image_width = 448
image_height = 448
image_width_delta = 90 #448*1.2-448
image_height_delta = 90 #448*1.2-448
batch_size = 64
valid_batch_size = 1
epoch_size = 26332
grids=7
lambda_noobj = 0.5
lambda_obj = 5.0
grid_width = image_width//grids
grid_height = image_height//grids
labels = ['person','bird','cat','cow','dog','horse','sheep','aeroplane','bicycle',
'boat','bus','car','motorbike','train','bottle','chair','diningtable',
'pottedplant','sofa','tvmonitor']
#Define the function to parse the TFRECORD
def _parse_function(example_proto):
features = {"image": tf.FixedLenFeature([], tf.string, default_value=""),
"height": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
"width": tf.FixedLenFeature([1], tf.int64, default_value=[0]),
"channels": tf.FixedLenFeature([1], tf.int64, default_value=[3]),
"colorspace": tf.FixedLenFeature([], tf.string, default_value=""),
"img_format": tf.FixedLenFeature([], tf.string, default_value=""),
"label": tf.VarLenFeature(tf.int64),
"bbox_xmin": tf.VarLenFeature(tf.int64),
"bbox_xmax": tf.VarLenFeature(tf.int64),
"bbox_ymin": tf.VarLenFeature(tf.int64),
"bbox_ymax": tf.VarLenFeature(tf.int64),
"filename": tf.FixedLenFeature([], tf.string, default_value="")
}
parsed_features = tf.parse_single_example(example_proto, features)
label = tf.expand_dims(parsed_features["label"].values, 0)
label = tf.cast(label, tf.float32)
height = parsed_features["height"]
width = parsed_features["width"]
channels = parsed_features["channels"]
#Generate the random crop offset
random_width_start = tf.random.uniform([1], minval=0, maxval=image_width_delta, dtype=tf.dtypes.int64)
random_height_start = tf.random.uniform([1], minval=0, maxval=image_height_delta, dtype=tf.dtypes.int64)
random_start = tf.concat([random_height_start, random_width_start, tf.constant([0], dtype=tf.dtypes.int64)], axis=0)
#Adjust the bbox coordinates with random crop offset
def f1():
xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
xmin = xmin - random_width_start
xmin = tf.clip_by_value(xmin, 0, image_width)
xmax = xmax - random_width_start
xmax = tf.clip_by_value(xmax, 0, image_width)
ymin = ymin - random_height_start
ymin = tf.clip_by_value(ymin, 0, image_height)
ymax = ymax - random_height_start
ymax = tf.clip_by_value(ymax, 0, image_height)
return xmin, xmax, ymin, ymax
#Adjust the bbox coordinates with image flipped and random crop offset
def f2():
xmin = tf.expand_dims(parsed_features["bbox_xmin"].values, 0)
xmax = tf.expand_dims(parsed_features["bbox_xmax"].values, 0)
ymin = tf.expand_dims(parsed_features["bbox_ymin"].values, 0)
ymax = tf.expand_dims(parsed_features["bbox_ymax"].values, 0)
xmin_temp = xmin - random_width_start
xmax_temp = xmax - random_width_start
xmin = image_width - tf.clip_by_value(xmax_temp, 0, image_width)
xmax = image_width - tf.clip_by_value(xmin_temp, 0, image_width)
ymin = ymin - random_height_start
ymin = tf.clip_by_value(ymin, 0, image_height)
ymax = ymax - random_height_start
ymax = tf.clip_by_value(ymax, 0, image_height)
return xmin, xmax, ymin, ymax
#Generate the random flip flag
random_flip = tf.random.uniform([1], minval=0, maxval=1, dtype=tf.dtypes.float32)
#Get the random flip and crop image coordinates
xmin, xmax, ymin, ymax = tf.cond(tf.less(random_flip[0], 0.5), f1, f2)
image_raw = tf.image.decode_jpeg(parsed_features["image"], channels=3)
image_sliced = tf.slice(image_raw, random_start, [image_height, image_width, -1])
image_decoded = tf.image.convert_image_dtype(image_sliced, tf.float32)
image_flipped = tf.cond(tf.less(random_flip[0], 0.5), lambda:image_decoded, lambda:tf.image.flip_left_right(image_decoded))
image_train = tf.image.per_image_standardization(image_flipped)
#Calculate the boxes center point
box_center_x = xmin+(xmax-xmin)//2
box_center_y = ymin+(ymax-ymin)//2
#Calculate which grid cell each box belongs to
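#grid_id is a row-major index into the 7x7 grid: grid_id = row*grids + col, where (row, col) is the cell that contains the box center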
grid_id = (tf.ceil(box_center_y/grid_height)-1)*grids + tf.ceil(box_center_x/grid_width) - 1
grid_id = tf.cast(grid_id, tf.float32)
#Calculate and normalize the bbox center and width by grids
center_x_percent = box_center_x%grid_width/grid_width
center_x_percent = tf.cast(center_x_percent, tf.float32)
center_y_percent = box_center_y%grid_height/grid_height
center_y_percent = tf.cast(center_y_percent, tf.float32)
box_width = (xmax-xmin)/image_width
box_width = tf.cast(box_width, tf.float32)
box_height = (ymax-ymin)/image_height
box_height = tf.cast(box_height, tf.float32)
#Generate the new bbox vector for label
bbox = tf.concat(axis=0, values=[grid_id, center_x_percent, center_y_percent, box_width, box_height, label])
bbox = tf.transpose(bbox, [1, 0])
return image_train, parsed_features["filename"], image_raw, bbox
#Construct the train dataset
with tf.device('/cpu:0'):
train_files = tf.data.Dataset.list_files("train_tf/*.tfrecord")
dataset_train = train_files.interleave(tf.data.TFRecordDataset, cycle_length=4, num_parallel_calls=4)
dataset_train = dataset_train.shuffle(buffer_size=epoch_size)
dataset_train = dataset_train.repeat(100)
dataset_train = dataset_train.map(_parse_function, num_parallel_calls=12)
dataset_train = dataset_train.padded_batch(batch_size, \
padded_shapes=([None,None,None], [], \
[None,None,None], [None,None]))
dataset_train = dataset_train.prefetch(batch_size)
iterator = tf.data.Iterator.from_structure(dataset_train.output_types, dataset_train.output_shapes)
image_train, filename, image_decoded, bbox = iterator.get_next()
train_init_op = iterator.make_initializer(dataset_train)
#Calculate the IOU; the merged argument is two rect vectors concatenated together
#Each rect vector has 4 elements: xmin, xmax, ymin and ymax
def calculate_IOU(merged):
rect1 = merged[:4]
rect2 = merged[4:8]
IOU = 0.0
IOU_area = 0.0
xmin=0.0
xmax=0.0
ymin=0.0
ymax= 0.0
rect1_area=0.0
rect2_area= 0.0
if (rect1[0]>=rect2[1] or rect2[0]>=rect1[1] or rect1[2]>=rect2[3] or rect2[2]>=rect1[3]):
IOU = 0.0
else:
xmin=tf.maximum(rect1[0], rect2[0])
xmax=tf.minimum(rect1[1], rect2[1])
ymin=tf.maximum(rect1[2], rect2[2])
ymax=tf.minimum(rect1[3], rect2[3])
IOU_area=(xmax-xmin)*(ymax-ymin)
rect1_area=(rect1[1]-rect1[0])*(rect1[3]-rect1[2])
rect2_area=(rect2[1]-rect2[0])*(rect2[3]-rect2[2])
IOU=IOU_area / (rect1_area+rect2_area-IOU_area)
return IOU
tf_calculate_IOU = autograph.to_graph(calculate_IOU)
#Calculate the loss. Based on the YOLO V1 paper
#The pred is the prediction vector with shape [batchsize*7*7,30].
#The label is the vector with shape [batchsize*7*7,26].
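#Layout of each 30-element pred row (as used below): [conf1, x1, y1, sqrt(w1), sqrt(h1), conf2, x2, y2, sqrt(w2), sqrt(h2), 20 class probabilities],
#where x/y are the center offsets within the grid cell and w/h are relative to the image size.
#Layout of each 26-element label row (built in the training loop further down):
#[grid_id, object flag, center_x, center_y, width, height, 20 one-hot class values]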
def loss_func(pred, label):
#Split the pred and label vectors into the parts with and without objects using boolean masks
mask_obj = label[:,1]>0.0
mask_noobj = label[:,1]<1.0
pred_obj = tf.boolean_mask(pred, mask_obj)
label_obj = tf.boolean_mask(label, mask_obj)
pred_noobj = tf.boolean_mask(pred, mask_noobj)
label_noobj = tf.boolean_mask(label, mask_noobj)
#Calculate the no obj prediction error
loss_noobj = tf.reduce_sum(tf.square(pred_noobj[:,0])+tf.square(pred_noobj[:,5]))
loss_classes = tf.reduce_sum(tf.square(pred_obj[:,10:]-label_obj[:,6:]))
#Calculate the prediction box coordinates
center_x1 = pred_obj[:,1:2]*grid_width
center_y1 = pred_obj[:,2:3]*grid_height
xmin_1 = center_x1 - pred_obj[:,3:4]**2*image_width//2
xmax_1 = center_x1 + pred_obj[:,3:4]**2*image_width//2
ymin_1 = center_y1 - pred_obj[:,4:5]**2*image_height//2
ymax_1 = center_y1 + pred_obj[:,4:5]**2*image_height//2
center_x2 = pred_obj[:,6:7]*grid_width
center_y2 = pred_obj[:,7:8]*grid_height
xmin_2 = center_x2 - pred_obj[:,8:9]**2*image_width//2
xmax_2 = center_x2 + pred_obj[:,8:9]**2*image_width//2
ymin_2 = center_y2 - pred_obj[:,9:10]**2*image_height//2
ymax_2 = center_y2 + pred_obj[:,9:10]**2*image_height//2
#Calculate the label box coordinates
center_x = label_obj[:,2:3]*grid_width
center_y = label_obj[:,3:4]*grid_height
xmin = center_x - label_obj[:,4:5]*image_width//2
xmax = center_x + label_obj[:,4:5]*image_width//2
ymin = center_y - label_obj[:,5:6]*image_height//2
ymax = center_y + label_obj[:,5:6]*image_height//2
#Concat the prediction box and ground truth box and calculate the IOU
merged1 = tf.concat([xmin_1,xmax_1,ymin_1,ymax_1,xmin,xmax,ymin,ymax], -1)
merged2 = tf.concat([xmin_2,xmax_2,ymin_2,ymax_2,xmin,xmax,ymin,ymax], -1)
IOU1 = tf.map_fn(tf_calculate_IOU, merged1)
IOU2 = tf.map_fn(tf_calculate_IOU, merged2)
#Select the higher IOU prediction box for coordination loss calculation
IOU1_mask = tf.math.greater_equal(IOU1,IOU2)
IOU2_mask = tf.math.greater(IOU2,IOU1)
coord_IOU1 = tf.boolean_mask(pred_obj[:,:5], IOU1_mask)
label_IOU1 = tf.boolean_mask(label_obj[:,2:6], IOU1_mask)
coord_IOU2 = tf.boolean_mask(pred_obj[:,5:10], IOU2_mask)
label_IOU2 = tf.boolean_mask(label_obj[:,2:6], IOU2_mask)
loss_coord = lambda_obj * (tf.reduce_sum( \
tf.square(coord_IOU1[:,1]-label_IOU1[:,0]) + \
tf.square(coord_IOU1[:,2]-label_IOU1[:,1]) + \
tf.square(coord_IOU1[:,3]-tf.sqrt(label_IOU1[:,2])) + \
tf.square(coord_IOU1[:,4]-tf.sqrt(label_IOU1[:,3])))+ \
tf.reduce_sum( \
tf.square(coord_IOU2[:,1]-label_IOU2[:,0]) + \
tf.square(coord_IOU2[:,2]-label_IOU2[:,1]) + \
tf.square(coord_IOU2[:,3]-tf.sqrt(label_IOU2[:,2])) + \
tf.square(coord_IOU2[:,4]-tf.sqrt(label_IOU2[:,3]))))
#Calculate the confidence loss for the two prediction boxes
loss_confidence = tf.reduce_sum(tf.square(pred_obj[:,0]-IOU1)+tf.square(pred_obj[:,5]-IOU2))
#Sum up all the loss parts
loss = (loss_noobj+loss_classes+loss_coord+loss_confidence)/batch_size
return loss
image_train_batch = tf.placeholder(shape=[None, image_height, image_width, 3], dtype=tf.float32)
grids_vector_batch = tf.placeholder(shape=[None, grids*grids, 26], dtype=tf.float32)
result = yolonet_model.inference(image_train_batch, pretrain_trainable=False, wd=0.0005, pretrain_training=False, yolo_training=True)
result = tf.reshape(result, [-1,30])
grids_vector = tf.reshape(grids_vector_batch, [-1,26])
mse_loss = loss_func(result, grids_vector)
tf.add_to_collection('losses', mse_loss)
loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
global_step = tf.Variable(0, trainable=False)
epoch_steps = int(epoch_size/batch_size)
boundaries = [epoch_steps*5,epoch_steps*55,epoch_steps*75]
values = [0.001, 0.01, 0.001, 0.0001]
learning_rate = tf.train.piecewise_constant(global_step, boundaries, values)
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
optimize_op = optimizer.minimize(loss, global_step=global_step)
#Load the pretrain Imagenet weights
#For the first training run, uncomment the code below.
'''
variables_list = []
for var in tf.all_variables():
#The vars with "new" in the name are for object detection training and are not included in the Imagenet pretrain
if 'new' in var.name or 'Variable' in var.name:
continue
else:
variables_list.append(var)
saver=tf.train.Saver(variables_list)
'''
saver_yolo=tf.train.Saver()
with tf.Session() as sess:
#For the first training run, uncomment the code below.
'''
sess.run(tf.global_variables_initializer())
#Load the pretrained Imagenet weights
saver.restore(sess, "/home/roy/AI/model_bn_loss/model.ckpt-105000")
sess.run(global_step.initializer)
'''
#Comment out the line below for the first training run
saver_yolo.restore(sess, "model_yolo/model.ckpt-30000")
sess.run([train_init_op])
total_loss = 0.0
starttime = time.time()
while(True):
try:
images_run, bbox_run = sess.run([image_train, bbox])
#Construct the grids_vector based on bbox_run
batch_num, box_num, _ = bbox_run.shape
grids_vector_list = []
for i in range(batch_num):
vector = np.zeros([grids*grids, 26], dtype=float)
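#Each row of the grid vector: [grid_id, object flag, center_x, center_y, width, height, 20 one-hot class values]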
for j in range(box_num):
if bbox_run[i][j][3]==0.0 or bbox_run[i][j][4]==0.0:
continue
else:
grid_id = int(bbox_run[i][j][0])
vector[grid_id][0] = grid_id
vector[grid_id][1] = 1.0
vector[grid_id][2] = bbox_run[i][j][1]
vector[grid_id][3] = bbox_run[i][j][2]
vector[grid_id][4] = bbox_run[i][j][3]
vector[grid_id][5] = bbox_run[i][j][4]
label = int(bbox_run[i][j][5])
vector[grid_id][label+6] = 1.0
grids_vector_list.append(vector)
grids_vector_run = np.stack(grids_vector_list)
loss_a, step, lr, _ = sess.run([loss, global_step, learning_rate, optimize_op], feed_dict={image_train_batch:images_run, grids_vector_batch:grids_vector_run})
total_loss += loss_a
if step%100==0:
print("step: %i, Learning_rate:%f, Time: %is Loss: %f" \
%(step, lr, int(time.time()-starttime), total_loss/100))
total_loss = 0.0
starttime = time.time()
if step%2000==0:
save_path = saver_yolo.save(sess, "model_yolo/model.ckpt", global_step=global_step)
except tf.errors.OutOfRangeError:
break
5. Validating the model
Once training is complete, we can check how well the model works. The following code loads the trained model and runs detection on an image; only BBOXes with a confidence value greater than 0.1 are kept, and non-maximum suppression is then applied to remove BBOXes whose IOU is greater than 0.2:
import tensorflow as tf
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import yolonet_model
image_width = 448
image_height = 448
batch_size = 1
grids=7
grid_width = image_width//grids
grid_height = image_height//grids
labels = ['person','bird','cat','cow','dog','horse','sheep','aeroplane','bicycle',
'boat','bus','car','motorbike','train','bottle','chair','diningtable',
'pottedplant','sofa','tvmonitor']
category_colors = {'person':(33,36,41), 'bird':(105,128,112), 'cat':(201,230,252),
'cow':(31,102,156), 'dog':(240,32,160), 'horse':(214,112,118),
'sheep':(18,153,255), 'aeroplane':(0,215,255), 'bicycle':(3,97,255),
'boat':(42,42,128), 'bus':(143,143,188), 'car':(18,38,94),
'motorbike':(230,224,176), 'train':(205,90,106), 'bottle':(15,94,56),
'chair':(84,46,8), 'diningtable':(64,145,61), 'pottedplant':(50,205,50),
'sofa':(80,127,255), 'tvmonitor':(31,23,176)}
def _parse_img(imgname):
image_raw = tf.gfile.FastGFile(imgname,'rb').read()
img = tf.image.decode_jpeg(image_raw)
image_decoded = tf.image.convert_image_dtype(img, tf.float32)
img_resized = tf.image.resize(image_decoded, [image_height, image_width])
img_processed = tf.image.per_image_standardization(img_resized)
img_processed = tf.expand_dims(img_processed, 0)
img_resized = tf.expand_dims(img_resized, 0)
img_original = tf.expand_dims(img, 0)
return img_processed, img_original
#Calculate the IOU of two rect.
#Each rect is a list of [(xmin, ymin), (xmax, ymax)]
def IOU_calculate(rect1, rect2):
if (rect1[0][0]>=rect2[1][0] or rect2[0][0]>=rect1[1][0] or rect1[0][1]>=rect2[1][1] or rect2[0][1]>=rect1[1][1]):
#if (rect1[0]>=rect2[1] or rect2[0]>=rect1[1] or rect1[2]>=rect2[3] or rect2[2]>=rect1[3]):
IOU = 0.0
else:
xmin=max(rect1[0][0], rect2[0][0])
xmax=min(rect1[1][0], rect2[1][0])
ymin=max(rect1[0][1], rect2[0][1])
ymax=min(rect1[1][1], rect2[1][1])
IOU_area=(xmax-xmin)*(ymax-ymin)
rect1_area=(rect1[1][0]-rect1[0][0])*(rect1[1][1]-rect1[0][1])
rect2_area=(rect2[1][0]-rect2[0][0])*(rect2[1][1]-rect2[0][1])
IOU=IOU_area / (rect1_area+rect2_area-IOU_area)
if IOU<0.0:
IOU = 0.0
if IOU>1.0:
IOU = 1.0
return IOU
test_image = tf.placeholder(tf.float32, [1, image_height,image_width,3])
result = yolonet_model.inference(test_image, pretrain_trainable=False, wd=0.0005, pretrain_training=False, yolo_training=False)
result = tf.reshape(result, [49,30])
result=tf.clip_by_value(result, 0.0, 1.0)
grid_index = tf.reshape(tf.constant([i for i in range(49)]), [-1,1])
grid_index = tf.cast(grid_index, tf.float32)
grid_infer = tf.concat([grid_index, result], axis=1)
grid_class_index = tf.reshape(tf.argmax(grid_infer[:,11:], axis=1)+11, [-1,1])
grid_class_probability = tf.batch_gather(grid_infer, grid_class_index)
mask1=tf.math.greater_equal(grid_infer[:,1:2]*grid_class_probability, 0.1) #keep box 1 if confidence*class probability>=0.1
mask2=tf.math.greater_equal(grid_infer[:,6:7]*grid_class_probability, 0.1) #keep box 2 if confidence*class probability>=0.1
mask=tf.reshape(mask1|mask2, [-1])
grid_object = tf.boolean_mask(grid_infer, mask) #Only keep the grids whose score passes the 0.1 threshold
grid_object_class = tf.argmax(grid_object[:,11:], axis=1) #Get the predict class
saver_yolo=tf.train.Saver()
img_processed, img = _parse_img("/home/roy/AI/darknet/darknet/data/dog.jpg")
with tf.Session() as sess:
saver_yolo.restore(sess, "model_yolo/model.ckpt-50000")
img1, image_run = sess.run([img_processed, img])
grid_object_run,grid_object_class_run,grid_infer_run,m1,m2,m,g_index_t,g_index_class_t = sess.run([grid_object,grid_object_class,grid_infer,mask1,mask2,mask,grid_class_index,grid_class_probability], feed_dict={test_image:img1})
original_height, original_width, _ = image_run[0].shape
height_ratio = original_height/image_height
width_ratio = original_width/image_width
box = []
category = []
confidence = []
probabilty = []
box_by_category = {}
for i in range(len(grid_object_run)):
coord_start = 2
if grid_object_run[i][1]<grid_object_run[i][6]:
coord_start = 7
center_x = grid_object_run[i][coord_start]*grid_width+grid_object_run[i][0]%grids*grid_width
center_y = grid_object_run[i][coord_start+1]*grid_height+grid_object_run[i][0]//grids*grid_height
xmin = int(max(int(center_x - grid_object_run[i][coord_start+2]**2*image_width//2), 0)*width_ratio)
xmax = int((center_x + grid_object_run[i][coord_start+2]**2*image_width//2)*width_ratio)
ymin = int(max(int(center_y - grid_object_run[i][coord_start+3]**2*image_height//2), 0)*height_ratio)
ymax = int((center_y + grid_object_run[i][coord_start+3]**2*image_height//2)*height_ratio)
box.append([(xmin,ymin),(xmax,ymax)])
category.append(labels[grid_object_class_run[i]])
confidence.append(grid_object_run[i][coord_start-1]*grid_object_run[i][grid_object_class_run[i]+11])
probabilty.append(grid_object_run[i][grid_object_class_run[i]+11])
for i in range(len(category)):
if category[i] in box_by_category:
box_by_category[category[i]].append((box[i],confidence[i],probabilty[i]))
else:
box_by_category[category[i]]=[(box[i],confidence[i],probabilty[i])]
image_copy = image_run.copy()
#Draw all the predicted bbox that confidence>0.1
fontFace = cv2.FONT_HERSHEY_COMPLEX
fontScale = 0.6
thickness = 1
lineType = 4
for i in range(len(box)):
cv2.rectangle(image_run[0], box[i][0], box[i][1], category_colors[category[i]],0)
cv2.putText(image_run[0], category[i], (box[i][0][0], box[i][0][1]-8), \
fontFace, fontScale, category_colors[category[i]], thickness, lineType)
plt.imshow(image_run[0])
#Use Non Max Suppression to remove some bbox
nmx_threshold=0.2
box_by_category_nmx = {}
for key in box_by_category:
nmx_box = sorted(box_by_category[key], key=lambda x:x[1], reverse=True)
start_index = 0
dropped_index = []
while True:
if start_index >= len(nmx_box):
break
if start_index in dropped_index:
start_index += 1
continue
rect1 = nmx_box[start_index][0] #The rect include [(xmin,ymin), (xmax,ymax)]
for i in range((start_index+1),len(nmx_box)):
if i not in dropped_index:
rect2 = nmx_box[i][0]
iou = IOU_calculate(rect1, rect2)
if iou > nmx_threshold:
dropped_index.append(i)
start_index += 1
box_by_category_nmx[key]=[]
for i in range(len(nmx_box)):
if i not in dropped_index:
box_by_category_nmx[key].append(nmx_box[i])
#Draw the bbox using Non max suppression
for key in box_by_category_nmx:
for i in range(len(box_by_category_nmx[key])):
cv2.rectangle(image_copy[0], box_by_category_nmx[key][i][0][0], box_by_category_nmx[key][i][0][1], category_colors[key],2)
probability = str(round(box_by_category_nmx[key][i][2]*100, 2))+"%"
cv2.putText(image_copy[0], key+" "+probability, \
(box_by_category_nmx[key][i][0][0][0], box_by_category_nmx[key][i][0][0][1]-8), \
fontFace, fontScale, category_colors[key], thickness, lineType)
#Save the picture
cv_image = cv2.cvtColor(image_copy[0], cv2.COLOR_BGR2RGB)
cv2.imwrite("predict_image/dog_yolo_50000.jpg", cv_image)
6. Detection results
Below are detection results on a few images. The model performs quite well, roughly on par with the results in the YOLO paper (except that in the last image the white horse is detected as SHEEP; my guess is that most of the sheep in the pre-training images were white, so they are easy to confuse with a white horse ^_^)