FCN源码解读之voc_layers.py

最新推荐文章于 2025-09-03 19:26:54 发布

转载最新推荐文章于 2025-09-03 19:26:54 发布 · 401 阅读

文章标签：

#FCN #caffe

Deep Learning 专栏收录该内容

14 篇文章

订阅专栏

本文深入解析了FCN中用于图像语义分割的VOCSegDataLayer和SBDDSegDataLayer数据层，详细介绍了这两个层的实现原理和代码解读，包括setup、reshape、forward和backward等关键函数的作用。

转载自 https://blog.csdn.net/qq_21368481/article/details/80246028

voc_layers.py是FCN中利用python写的数据层（即使用caffe的Python API 写的数据输入层），其格式是相对固定的，包含setup()、reshape()、forward()、backward()四个必要函数。

其源码如下：


   
     
      
     
     
      
       import caffe
      
     

     
      
     
     
       
      
     

     
      
     
     
      
       import numpy 
       as np
      
     

     
      
     
     
      
       from PIL 
       import Image
      
     

     
      
     
     
       
      
     

     
      
     
     
      
       import random
      
     

     
      
     
     
       
      
     

     
      
     
     
      
       class VOCSegDataLayer(caffe.Layer):
           """
           Load (input image, label image) pairs from PASCAL VOC
           one-at-a-time while reshaping the net to preserve dimensions.

           Use this to feed data to a fully convolutional network.
           """

           def setup(self, bottom, top):
               """
               Setup data layer according to parameters:

               - voc_dir: path to PASCAL VOC year dir
               - split: train / val / test
               - mean: tuple of mean values to subtract
               - randomize: load in random order (default: True)
               - seed: seed for randomization (default: None / current time)

               for PASCAL VOC semantic segmentation.

               example

               params = dict(voc_dir="/path/to/PASCAL/VOC2011",
                   mean=(104.00698793, 116.66876762, 122.67891434),
                   split="val")

               Raises:
                   Exception: if the layer does not have exactly two tops,
                       or if any bottom is defined.
               """
               # config
               # NOTE(review): eval() executes whatever expression param_str
               # holds, so the net definition must come from a trusted source.
               # ast.literal_eval would be safer for dict-literal strings but
               # would reject the dict(...) call form shown in the docstring.
               params = eval(self.param_str)
               self.voc_dir = params['voc_dir']
               self.split = params['split']
               self.mean = np.array(params['mean'])
               self.random = params.get('randomize', True)
               self.seed = params.get('seed', None)

               # two tops: data and label
               if len(top) != 2:
                   raise Exception("Need to define two tops: data and label.")
               # data layers have no bottoms
               if len(bottom) != 0:
                   raise Exception("Do not define a bottom.")

               # load indices for images and labels
               split_f = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                       self.split)
               # use a context manager so the file handle is closed right away
               with open(split_f, 'r') as f:
                   self.indices = f.read().splitlines()
               self.idx = 0

               # make eval deterministic
               if 'train' not in self.split:
                   self.random = False

               # randomization: seed and pick
               if self.random:
                   random.seed(self.seed)
                   self.idx = random.randint(0, len(self.indices)-1)

           def reshape(self, bottom, top):
               """
               Load the image/label pair selected by self.idx and resize both
               tops to the shapes of the loaded arrays.
               """
               # load image + label image pair
               self.data = self.load_image(self.indices[self.idx])
               self.label = self.load_label(self.indices[self.idx])
               # reshape tops to fit (leading 1 is for batch dimension)
               top[0].reshape(1, *self.data.shape)
               top[1].reshape(1, *self.label.shape)

           def forward(self, bottom, top):
               """
               Copy the loaded pair into the tops, then choose the index of
               the next pair: a random one when self.random is set, otherwise
               the following entry, wrapping to 0 after the last.
               """
               # assign output
               top[0].data[...] = self.data
               top[1].data[...] = self.label

               # pick next input
               if self.random:
                   self.idx = random.randint(0, len(self.indices)-1)
               else:
                   self.idx += 1
                   if self.idx == len(self.indices):
                       self.idx = 0

           def backward(self, top, propagate_down, bottom):
               """No-op: this layer has no bottoms to propagate gradients to."""
               pass

           def load_image(self, idx):
               """
               Load input image and preprocess for Caffe:
               - cast to float
               - switch channels RGB -> BGR
               - subtract mean
               - transpose to channel x height x width order

               idx is an entry of self.indices; it is used as the file name
               (without extension) under <voc_dir>/JPEGImages.
               """
               im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
               in_ = np.array(im, dtype=np.float32)
               in_ = in_[:,:,::-1]
               in_ -= self.mean
               in_ = in_.transpose((2,0,1))
               return in_

           def load_label(self, idx):
               """
               Load label image as 1 x height x width integer array of label indices.
               The leading singleton dimension is required by the loss.

               idx is an entry of self.indices; it is used as the file name
               (without extension) under <voc_dir>/SegmentationClass.
               """
               im = Image.open('{}/SegmentationClass/{}.png'.format(self.voc_dir, idx))
               label = np.array(im, dtype=np.uint8)
               label = label[np.newaxis, ...]
               return label
      
     

     
      
     
     
       
      
     

     
      
     
     
       
      
     

     
      
     
     
      
       class SBDDSegDataLayer(caffe.Layer):
           """
           Load (input image, label image) pairs from the SBDD extended labeling
           of PASCAL VOC for semantic segmentation
           one-at-a-time while reshaping the net to preserve dimensions.

           Use this to feed data to a fully convolutional network.
           """

           def setup(self, bottom, top):
               """
               Setup data layer according to parameters:

               - sbdd_dir: path to SBDD `dataset` dir
               - split: train / seg11valid
               - mean: tuple of mean values to subtract
               - randomize: load in random order (default: True)
               - seed: seed for randomization (default: None / current time)

               for SBDD semantic segmentation.

               N.B. seg11valid is the set of segval11 that does not intersect with SBDD.
               Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.

               example

               params = dict(sbdd_dir="/path/to/SBDD/dataset",
                   mean=(104.00698793, 116.66876762, 122.67891434),
                   split="valid")

               Raises:
                   Exception: if the layer does not have exactly two tops,
                       or if any bottom is defined.
               """
               # config
               # NOTE(review): eval() executes whatever expression param_str
               # holds, so the net definition must come from a trusted source.
               # ast.literal_eval would be safer for dict-literal strings but
               # would reject the dict(...) call form shown in the docstring.
               params = eval(self.param_str)
               self.sbdd_dir = params['sbdd_dir']
               self.split = params['split']
               self.mean = np.array(params['mean'])
               self.random = params.get('randomize', True)
               self.seed = params.get('seed', None)

               # two tops: data and label
               if len(top) != 2:
                   raise Exception("Need to define two tops: data and label.")
               # data layers have no bottoms
               if len(bottom) != 0:
                   raise Exception("Do not define a bottom.")

               # load indices for images and labels
               split_f = '{}/{}.txt'.format(self.sbdd_dir,
                       self.split)
               # use a context manager so the file handle is closed right away
               with open(split_f, 'r') as f:
                   self.indices = f.read().splitlines()
               self.idx = 0

               # make eval deterministic
               if 'train' not in self.split:
                   self.random = False

               # randomization: seed and pick
               if self.random:
                   random.seed(self.seed)
                   self.idx = random.randint(0, len(self.indices)-1)

           def reshape(self, bottom, top):
               """
               Load the image/label pair selected by self.idx and resize both
               tops to the shapes of the loaded arrays.
               """
               # load image + label image pair
               self.data = self.load_image(self.indices[self.idx])
               self.label = self.load_label(self.indices[self.idx])
               # reshape tops to fit (leading 1 is for batch dimension)
               top[0].reshape(1, *self.data.shape)
               top[1].reshape(1, *self.label.shape)

           def forward(self, bottom, top):
               """
               Copy the loaded pair into the tops, then choose the index of
               the next pair: a random one when self.random is set, otherwise
               the following entry, wrapping to 0 after the last.
               """
               # assign output
               top[0].data[...] = self.data
               top[1].data[...] = self.label

               # pick next input
               if self.random:
                   self.idx = random.randint(0, len(self.indices)-1)
               else:
                   self.idx += 1
                   if self.idx == len(self.indices):
                       self.idx = 0

           def backward(self, top, propagate_down, bottom):
               """No-op: this layer has no bottoms to propagate gradients to."""
               pass

           def load_image(self, idx):
               """
               Load input image and preprocess for Caffe:
               - cast to float
               - switch channels RGB -> BGR
               - subtract mean
               - transpose to channel x height x width order

               idx is an entry of self.indices; it is used as the file name
               (without extension) under <sbdd_dir>/img.
               """
               im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
               in_ = np.array(im, dtype=np.float32)
               in_ = in_[:,:,::-1]
               in_ -= self.mean
               in_ = in_.transpose((2,0,1))
               return in_

           def load_label(self, idx):
               """
               Load label image as 1 x height x width integer array of label indices.
               The leading singleton dimension is required by the loss.

               The label is read from the 'GTcls' -> 'Segmentation' field of
               the MATLAB file <sbdd_dir>/cls/<idx>.mat.
               """
               # imported here so scipy is only required when SBDD labels are used
               import scipy.io
               mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
               label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
               label = label[np.newaxis, ...]
               return label

详细代码解读如下

1.VOCSegDataLayer类（也即net.py中声明的测试时的输入层pylayer）

此类对应于val.prototxt中的输入层，即：


   
     
      
     
     
      
       layer {
      
     

     
      
     
     
      
         name: 
       "data"
      
     

     
      
     
     
      
         type: 
       "Python"
      
     

     
      
     
     
      
         top: 
       "data"
      
     

     
      
     
     
      
         top: 
       "label"
      
     

     
      
     
     
      
         python_param {
      
     

     
      
     
     
      
           module: 
       "voc_layers"
      
     

     
      
     
     
      
           layer: 
       "VOCSegDataLayer"
      
     

     
      
     
     
      
           param_str: 
       "{\'voc_dir\': \'../data/VOC2012\', \'seed\': 1337, \'split\': \'seg11valid\', \'mean\': (104.00699, 116.66877, 122.67892)}"
      
     

     
      
     
     
      
         }
      
     

     
      
     
     
      
       }

具体源码解读如下


   
     
      
     
     
      
       # VOCSegDataLayer is the data layer used at test time (for the test or
       # validation set). The class defines the methods required by Caffe's
       # Python layer interface; for details see
       # https://chrischoy.github.io/research/caffe-python-layer/
       class VOCSegDataLayer(caffe.Layer):
           """
           Load (input image, label image) pairs from PASCAL VOC
           one-at-a-time while reshaping the net to preserve dimensions.

           Use this to feed data to a fully convolutional network.
           """

           # setup(): configure the data layer from its parameters
           def setup(self, bottom, top):
               """
               Setup data layer according to parameters:

               - voc_dir: path to PASCAL VOC year dir
                 (the path of the test or validation set)
               - split: train / val / test
                 (any of the three may be used, so results on the training
                 set can be inspected as well)
               - mean: tuple of mean values to subtract
                 (subtracting the mean helps training converge faster)
               - randomize: load in random order (default: True)
                 (randomize=True turns on random image loading)
               - seed: seed for randomization (default: None / current time)
                 (seed for the random mode; defaults to None)

               for PASCAL VOC semantic segmentation.

               example

               params = dict(voc_dir="/path/to/PASCAL/VOC2011",
                   mean=(104.00698793, 116.66876762, 122.67891434),
                   split="val")
               """
               # config
               # parse the parameters
               # NOTE(review): eval() executes whatever expression param_str
               # holds, so the net definition must come from a trusted source.
               params = eval(self.param_str)
               self.voc_dir = params['voc_dir']
               self.split = params['split']
               self.mean = np.array(params['mean'])
               self.random = params.get('randomize', True)
               self.seed = params.get('seed', None)

               # two tops: data and label
               # check that exactly two outputs (data and label) are defined
               if len(top) != 2:
                   raise Exception("Need to define two tops: data and label.")
               # data layers have no bottoms
               # check that no input is defined (a data layer needs no bottom)
               if len(bottom) != 0:
                   raise Exception("Do not define a bottom.")

               # load indices for images and labels
               # read the ids of the images to load from the '<split>.txt'
               # file; these ids are in fact the image file names
               # (first {} is self.voc_dir, second {} is self.split)
               split_f = '{}/ImageSets/Segmentation/{}.txt'.format(self.voc_dir,
                       self.split)
               # splitlines() splits on line boundaries ('\r', '\r\n', '\n')
               # and returns a list with one element per line, so indices is
               # the list of all image ids (one per line of the file);
               # the context manager closes the file handle right away
               with open(split_f, 'r') as f:
                   self.indices = f.read().splitlines()
               self.idx = 0  # position in the indices list, starts at 0

               # make eval deterministic
               # any split whose name does not contain 'train' (e.g. val or
               # test) is read sequentially, never at random
               if 'train' not in self.split:
                   self.random = False

               # randomization: seed and pick
               # only when random loading is enabled
               if self.random:
                   random.seed(self.seed)
                   # draw a random integer in 0..len(self.indices)-1 as idx
                   self.idx = random.randint(0, len(self.indices)-1)

           # reshape(): load the image pair selected by idx and resize the
           # data layer accordingly
           def reshape(self, bottom, top):
               # load image + label image pair
               # load_image() and load_label() are helper methods of this
               # layer (load_label() is not part of this excerpt)
               self.data = self.load_image(self.indices[self.idx])
               self.label = self.load_label(self.indices[self.idx])
               # reshape tops to fit (leading 1 is for batch dimension)
               #
               # The data layer is resized on every call: its size may change
               # from one iteration to the next, because in a fully
               # convolutional net the input size does not affect the sizes
               # of the layer parameters.
               # Caffe stores data as N*C*H*W, where N is the batch size, C the
               # number of channels, and H and W the height and width; the
               # leading 1 here is the batch size, which matches the SGD with
               # one image per iteration described in the FCN paper.
               top[0].reshape(1, *self.data.shape)   # data
               top[1].reshape(1, *self.label.shape)  # label

           # forward(): the forward pass of a data layer does not transform
           # the data, it simply outputs it
           def forward(self, bottom, top):
               # assign output
               top[0].data[...] = self.data
               top[1].data[...] = self.label
               # while emitting the data, also choose the image for the next
               # iteration (i.e. produce the next index idx)
               # pick next input
               if self.random:
                   self.idx = random.randint(0, len(self.indices)-1)
               else:
                   self.idx += 1
                   if self.idx == len(self.indices):
                       self.idx = 0

           # a data layer needs no backward pass, so this is just `pass`
           def backward(self, top, propagate_down, bottom):
               pass

           # load an image, selected by its id idx
           def load_image(self, idx):
               """
               Load input image and preprocess for Caffe
               (i.e. convert it to the data layout Caffe expects):
               - cast to float
               - switch channels RGB -> BGR
                 (swap the R and B channels; the article's author guesses
                 this is because OpenCV, which uses BGR order, is involved)
               - subtract mean
               - transpose to channel x height x width order
                 (channels first, matching Caffe's storage layout)
               """
               im = Image.open('{}/JPEGImages/{}.jpg'.format(self.voc_dir, idx))
               in_ = np.array(im, dtype=np.float32)
               # a step of -1 walks the last axis backwards, which swaps the
               # R and B channels
               in_ = in_[:,:,::-1]
               in_ -= self.mean  # subtract the mean
               in_ = in_.transpose((2,0,1))  # move the channel axis to the front
               return in_

Python中的PIL所读取的三通道彩色图片是按H*W*C存放的，且三通道的顺序是标准的RGB顺序，因此在输入到caffe中处理前，需要进行相应的转换。

caffe中的数据存储方式是N*C*H*W，且是按BGR顺序存放三通道的。

所以需要先将RGB转换为BGR，可直接使用 in_ = in_[:,:,::-1] 语句实现，具体理解可参见以下例子（其中a的第三维可看成是C，且按RGB顺序存放，前两维可看成是H和W）：


   
     
      
     
     
      
       import numpy as np

       # Toy 3 x 2 x 3 "image": the first two axes play the role of H and W,
       # the last axis holds the channels in R, G, B order.
       a = np.array([
           [[1, 2, 3], [4, 5, 6]],
           [[7, 8, 9], [10, 11, 12]],
           [[13, 14, 15], [16, 17, 18]],
       ])
       print(a.shape)
       print(a)

       # Reverse only the last axis: every pixel goes from (R, G, B) to (B, G, R).
       a = a[:, :, ::-1]
       # The channel-first transpose is deliberately left out of this example
       # so that only the effect of the channel reversal is visible.
       print(a.shape)
       print(a)

以这个例子来说，第一行第一列所在位置的像素点的像素值分别为：R=1，G=2，B=3

运行结果为（可以看出第一行第一列所在位置的像素点的像素值分别为：B=3，G=2，R=1）：

在此基础上还需要减去各个通道的均值（即对输入做去均值处理），这样有助于加快网络训练的收敛速度。

最后按照caffe的存储数据的格式将通道数放在前面，即利用python中的transpose()函数进行转置操作，具体理解参见以下例子：


   
     
      
     
     
      
       import numpy as np

       # Toy 3 x 2 x 3 "image": the first two axes play the role of H and W,
       # the last axis holds the channels in R, G, B order.
       a = np.array([
           [[1, 2, 3], [4, 5, 6]],
           [[7, 8, 9], [10, 11, 12]],
           [[13, 14, 15], [16, 17, 18]],
       ])
       print(a.shape)
       print(a)

       # Step 1: reverse the channel axis (RGB -> BGR).
       a = a[:, :, ::-1]
       # Step 2: move the channel axis to the front, (H, W, C) -> (C, H, W).
       a = a.transpose((2, 0, 1))
       print(a.shape)
       print(a)

运行结果如下（由此可看出原来的第三维变到了第一维）：

即，例如结果中的第一个二维数组 [[3, 6], [9, 12], [15, 18]] 表示的是所有像素点的B通道的像素数值，也即表示的是原图像的B通道。


   
     
      
     
     
          
       #加载标记的函数（按照索引号idx加载相应的label图片）
      
     

     
      
     
     
          
       def load_label(self, idx):
           """
           Read the label PNG for `idx` and return it with a leading axis of size 1.

           The file is `<voc_dir>/SegmentationClass/<idx>.png`; it is decoded
           into a uint8 array of label indices. The extra leading singleton
           dimension is required by the loss.
           """
           png_path = '{}/SegmentationClass/{}.png'.format(self.voc_dir, idx)
           label_map = np.array(Image.open(png_path), dtype=np.uint8)
           # np.newaxis inserts a new first axis: H x W becomes 1 x H x W.
           return label_map[np.newaxis, ...]

2.SBDDSegDataLayer类（也即net.py中声明的训练时的输入层pylayer）

此类对应于train.prototxt中的输入层，即：


   
     
      
     
     
      
       # Training input layer: a Python layer that exposes two tops, "data" and "label".
       layer {
         name: "data"
         type: "Python"
         top: "data"
         top: "label"
         python_param {
           # Python module and the class inside it that implement this layer.
           module: "voc_layers"
           layer: "SBDDSegDataLayer"
           # Handed to the layer as a string; SBDDSegDataLayer.setup() evaluates it
           # into a dict and reads sbdd_dir, split, mean and the optional seed.
           param_str: "{\'sbdd_dir\': \'../data/VOC2012\', \'seed\': 1337, \'split\': \'train\', \'mean\': (104.00699, 116.66877, 122.67892)}"
         }
       }

SBDDSegDataLayer类的代码和VOCSegDataLayer类类似，在此不再重复解读，就其中的一小点进行说明。


   
     
      
     
     
      
       #训练时用到的定义数据层的SBDDSegDataLayer类（对应于训练集），类中根据caffe提供的python接口定义相
      
     

     
      
     
     
      
       #应的函数
      
     

     
      
     
     
      
       class SBDDSegDataLayer(caffe.Layer):
           """
           Load (input image, label image) pairs from the SBDD extended labeling
           of PASCAL VOC for semantic segmentation
           one-at-a-time while reshaping the net to preserve dimensions.

           Use this to feed data to a fully convolutional network.
           """

           def setup(self, bottom, top):
               """
               Setup data layer according to parameters:

               - sbdd_dir: path to SBDD `dataset` dir
               - split: train / seg11valid
               - mean: tuple of mean values to subtract
               - randomize: load in random order (default: True)
               - seed: seed for randomization (default: None / current time)

               for SBDD semantic segmentation.

               N.B. seg11valid is the set of segval11 that does not intersect with SBDD.
               Find it here: https://gist.github.com/shelhamer/edb330760338892d511e.

               example

               params = dict(sbdd_dir="/path/to/SBDD/dataset",
                   mean=(104.00698793, 116.66876762, 122.67891434),
                   split="valid")

               Raises Exception if the layer is not wired with exactly two tops
               and zero bottoms.
               """
               # config
               # NOTE(review): eval() executes whatever expression param_str holds,
               # so this is only safe while the prototxt is trusted. For plain dict
               # literals ast.literal_eval would be a safer parser - confirm that no
               # existing config relies on full expressions before switching.
               params = eval(self.param_str)
               self.sbdd_dir = params['sbdd_dir']
               self.split = params['split']
               self.mean = np.array(params['mean'])
               self.random = params.get('randomize', True)
               self.seed = params.get('seed', None)

               # two tops: data and label
               if len(top) != 2:
                   raise Exception("Need to define two tops: data and label.")
               # data layers have no bottoms
               if len(bottom) != 0:
                   raise Exception("Do not define a bottom.")

               # load indices for images and labels
               split_f = '{}/{}.txt'.format(self.sbdd_dir,
                       self.split)
               # Read the index list inside a context manager so the file handle
               # is closed deterministically instead of being left to the GC.
               with open(split_f, 'r') as split_file:
                   self.indices = split_file.read().splitlines()
               self.idx = 0

               # make eval deterministic
               if 'train' not in self.split:
                   self.random = False

               # randomization: seed and pick
               if self.random:
                   random.seed(self.seed)
                   self.idx = random.randint(0, len(self.indices)-1)

           def reshape(self, bottom, top):
               """
               Load the pair selected by self.idx and resize both tops to fit it.
               """
               # load image + label image pair
               self.data = self.load_image(self.indices[self.idx])
               self.label = self.load_label(self.indices[self.idx])
               # reshape tops to fit (leading 1 is for batch dimension)
               top[0].reshape(1, *self.data.shape)
               top[1].reshape(1, *self.label.shape)

           def forward(self, bottom, top):
               """
               Copy the loaded pair into the tops, then choose the next index.

               In random mode the next index is drawn uniformly from the index
               list; otherwise the index advances by one and wraps back to 0
               after the last entry.
               """
               # assign output
               top[0].data[...] = self.data
               top[1].data[...] = self.label

               # pick next input
               if self.random:
                   self.idx = random.randint(0, len(self.indices)-1)
               else:
                   self.idx += 1
                   if self.idx == len(self.indices):
                       self.idx = 0

           def backward(self, top, propagate_down, bottom):
               """
               No-op: this data layer has no bottoms to propagate gradients to.
               """
               pass

           def load_image(self, idx):
               """
               Load input image and preprocess for Caffe:
               - cast to float
               - switch channels RGB -> BGR
               - subtract mean
               - transpose to channel x height x width order
               """
               im = Image.open('{}/img/{}.jpg'.format(self.sbdd_dir, idx))
               in_ = np.array(im, dtype=np.float32)
               # Reversing the last axis swaps the R and B channels.
               in_ = in_[:,:,::-1]
               # In-place subtraction keeps the float32 dtype.
               in_ -= self.mean
               in_ = in_.transpose((2,0,1))
               return in_

           def load_label(self, idx):
               """
               Load label image as 1 x height x width integer array of label indices.
               The leading singleton dimension is required by the loss.
               """
               import scipy.io
               # The training labels are stored as MATLAB .mat files.
               mat = scipy.io.loadmat('{}/cls/{}.mat'.format(self.sbdd_dir, idx))
               # Reads the 'Segmentation' field of the first 'GTcls' record and
               # casts it to uint8.
               label = mat['GTcls'][0]['Segmentation'][0].astype(np.uint8)
               label = label[np.newaxis, ...]
               return label

SBDDSegDataLayer类所加载的训练样本的标记图片是按Matlab的mat进行存储的，但实际使用时，我们没有必要按照mat格式来加载标记图片，可参见VOCSegDataLayer类直接读取.png或.jpg格式的标记图片，即可将这个load_label()函数修改为：


   
     
      
     
     
         
       def load_label(self, idx):
           """
           Read the label PNG for `idx` and return it with a leading axis of size 1.

           Variant of the SBDD label loader that reads
           `<sbdd_dir>/SegmentationClass/<idx>.png` instead of a .mat file.
           The image is decoded into a uint8 array of label indices; the extra
           leading singleton dimension is required by the loss.
           """
           png_path = '{}/SegmentationClass/{}.png'.format(self.sbdd_dir, idx)
           label_map = np.array(Image.open(png_path), dtype=np.uint8)
           # H x W -> 1 x H x W
           return label_map[np.newaxis, ...]