Excerpted from the TensorFlow edition (2.1.0) of Dive into Deep Learning (《动手学深度学习》)
Why image augmentation matters
- Large-scale datasets are a prerequisite for successfully applying deep neural networks.
- Image augmentation generates similar but distinct training examples by applying a series of random changes to the training images, thereby enlarging the training set.
- Another way to look at it: randomly altering the training examples reduces the model's reliance on particular attributes, which improves its ability to generalize. For example:
  - Cropping the image in different ways makes the object of interest appear in different positions, reducing the model's dependence on where the object appears.
  - Adjusting brightness, color, and other factors reduces the model's sensitivity to color.
import tensorflow as tf
import numpy as np
print(tf.__version__)
Read an image
from matplotlib import pyplot as plt
img = plt.imread('./hotdog/train/hotdog/0.png')
plt.imshow(img)
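For a PNG, plt.imread decodes the image into a float array scaled to [0, 1], which is the range the tf.image operations below expect. A quick sanity check of shape and dtype:

print(img.shape, img.dtype)  # e.g. (height, width, 3), float32, values in [0, 1]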
Define helper functions that apply a given transformation to the image and display the results:
def show_images(imgs, num_rows, num_cols, scale=2):  # display num_rows * num_cols images from imgs in a grid
    figsize = (num_cols * scale, num_rows * scale)
    _, axes = plt.subplots(num_rows, num_cols, figsize=figsize)
    for i in range(num_rows):
        for j in range(num_cols):
            axes[i][j].imshow(imgs[i * num_cols + j])
            axes[i][j].axes.get_xaxis().set_visible(False)  # hide the x-axis
            axes[i][j].axes.get_yaxis().set_visible(False)  # hide the y-axis
    return axes

def apply(img, aug, num_rows=2, num_cols=4, scale=1.5):  # apply aug to img num_rows * num_cols times
    Y = [aug(img) for _ in range(num_rows * num_cols)]
    show_images(Y, num_rows, num_cols, scale)
Flip left-right with probability 1/2
apply(img, tf.image.random_flip_left_right)
Flip up-down with probability 1/2
apply(img, tf.image.random_flip_up_down)
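The two flips can also be composed into a single transformation and passed to apply; each flip still fires independently with probability 1/2. A minimal sketch (the helper name compose_flips is ours):

def compose_flips(image):
    image = tf.image.random_flip_left_right(image)  # horizontal flip with probability 1/2
    image = tf.image.random_flip_up_down(image)     # vertical flip with probability 1/2
    return image

apply(img, compose_flips)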
Random cropping
The object of interest does not always appear in the center of the image. Pooling layers make a network less sensitive to the target's position; random cropping, which makes objects appear at different scales and positions, serves the same purpose.
Here we crop a random region 100 pixels wide and 100 pixels high (the crop size must be smaller than the original image). The tf.image.random_crop API does not offer MXNet's option of cropping a region whose area is 10%-100% of the original with an aspect ratio drawn from 0.5-2; a way to approximate that is sketched after the code below.
aug = tf.image.random_crop
num_rows = 2
num_cols = 4
scale = 1.5
crop_size = 100  # must be smaller than the original image's height and width
Y = [aug(img, (crop_size, crop_size, 3)) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
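To approximate MXNet's random resized crop (area 10%-100% of the image, aspect ratio in 0.5-2, then resize to a fixed size), one option is tf.image.sample_distorted_bounding_box followed by a slice and resize. A rough sketch under those assumptions (the helper name random_resized_crop is ours):

def random_resized_crop(image, size=(200, 200)):
    # sample a crop window covering 10%-100% of the image area
    # with an aspect ratio drawn from [0.5, 2]
    begin, crop_shape, _ = tf.image.sample_distorted_bounding_box(
        tf.shape(image),
        bounding_boxes=tf.constant([[[0.0, 0.0, 1.0, 1.0]]]),  # the whole image
        min_object_covered=0.1,
        area_range=(0.1, 1.0),
        aspect_ratio_range=(0.5, 2.0))
    crop = tf.slice(image, begin, crop_shape)
    return tf.image.resize(crop, size)  # resize the crop to a fixed output size

apply(img, random_resized_crop)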
Color jittering
Four aspects can be varied: brightness, contrast, saturation, and hue.
Randomly vary the image's brightness. Note that tf.image.random_brightness adds a random offset drawn from [-max_delta, max_delta] to the pixel values, so with max_delta=0.5 on a [0, 1] image the brightness shifts by up to ±0.5; the MXNet version instead scales brightness to between 50% (1 − 0.5) and 150% (1 + 0.5) of the original. A multiplicative variant is sketched after the code below.
aug = tf.image.random_brightness
num_rows = 2
num_cols = 4
scale = 1.5
max_delta = 0.5
Y = [aug(img, max_delta) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
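If the goal is the multiplicative behavior described in the MXNet book (scale brightness to 50%-150% of the original), a simple sketch is to multiply by a random factor and clip back to the valid range (the helper name random_brightness_scale is ours):

def random_brightness_scale(image, lower=0.5, upper=1.5):
    factor = tf.random.uniform([], lower, upper)       # random scale in [0.5, 1.5)
    return tf.clip_by_value(image * factor, 0.0, 1.0)  # keep pixel values in [0, 1]

apply(img, random_brightness_scale)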
Hue variation
tf.image.random_hue rotates the hue by a random offset in [-max_delta, max_delta]; max_delta must lie in [0, 0.5].
aug = tf.image.random_hue
num_rows = 2
num_cols = 4
scale = 1.5
max_delta = 0.5
Y = [aug(img, max_delta) for _ in range(num_rows * num_cols)]
show_images(Y, num_rows, num_cols, scale)
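The remaining two color factors listed above, contrast and saturation, have analogous ops; both take a multiplicative lower/upper range rather than a max_delta:

apply(img, lambda im: tf.image.random_contrast(im, lower=0.5, upper=1.5))    # contrast scaled by 0.5-1.5
apply(img, lambda im: tf.image.random_saturation(im, lower=0.5, upper=1.5))  # saturation scaled by 0.5-1.5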
Training a model with image augmentation
Train a ResNet-18 model on the CIFAR-10 dataset, using random left-right flipping as augmentation.
(x, y), (test_x, test_y) = tf.keras.datasets.cifar10.load_data()
print(x.shape, test_x.shape)
show_images(x[0:32], 4, 8, scale=0.8)  # show the first 32 training images
from tensorflow.keras import layers, activations

class Residual(tf.keras.Model):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs):
        super(Residual, self).__init__(**kwargs)
        self.conv1 = layers.Conv2D(num_channels,
                                   padding='same',
                                   kernel_size=3,
                                   strides=strides)
        self.conv2 = layers.Conv2D(num_channels, kernel_size=3, padding='same')
        if use_1x1conv:
            # 1x1 convolution to match the shortcut's shape when downsampling
            self.conv3 = layers.Conv2D(num_channels,
                                       kernel_size=1,
                                       strides=strides)
        else:
            self.conv3 = None
        self.bn1 = layers.BatchNormalization()
        self.bn2 = layers.BatchNormalization()

    def call(self, X):
        Y = activations.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        return activations.relu(Y + X)

class ResnetBlock(tf.keras.layers.Layer):
    def __init__(self, num_channels, num_residuals, first_block=False, **kwargs):
        super(ResnetBlock, self).__init__(**kwargs)
        self.listLayers = []
        for i in range(num_residuals):
            if i == 0 and not first_block:
                self.listLayers.append(Residual(num_channels, use_1x1conv=True, strides=2))
            else:
                self.listLayers.append(Residual(num_channels))

    def call(self, X):
        for layer in self.listLayers:
            X = layer(X)
        return X

class ResNet(tf.keras.Model):
    def __init__(self, num_blocks, **kwargs):
        super(ResNet, self).__init__(**kwargs)
        self.conv = tf.keras.layers.Conv2D(64, kernel_size=7, strides=2, padding='same')
        self.bn = tf.keras.layers.BatchNormalization()
        self.relu = tf.keras.layers.Activation('relu')
        self.mp = tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding='same')
        self.resnet_block1 = ResnetBlock(64, num_blocks[0], first_block=True)
        self.resnet_block2 = ResnetBlock(128, num_blocks[1])
        self.resnet_block3 = ResnetBlock(256, num_blocks[2])
        self.resnet_block4 = ResnetBlock(512, num_blocks[3])
        self.gap = tf.keras.layers.GlobalAvgPool2D()
        self.fc = tf.keras.layers.Dense(units=10, activation=tf.keras.activations.softmax)

    def call(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.relu(x)
        x = self.mp(x)
        x = self.resnet_block1(x)
        x = self.resnet_block2(x)
        x = self.resnet_block3(x)
        x = self.resnet_block4(x)
        x = self.gap(x)
        x = self.fc(x)
        return x
net = ResNet([2,2,2,2])
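Before training, a quick forward pass on a dummy batch confirms the model builds and outputs one probability per CIFAR-10 class:

X = tf.random.uniform((1, 32, 32, 3))
print(net(X).shape)  # expected: (1, 10)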
x = np.array([tf.image.random_flip_left_right(i) for i in x])  # random left-right flip, applied once to the whole training set
net.compile(loss='sparse_categorical_crossentropy',
            optimizer=tf.keras.optimizers.Adam(),
            metrics=['accuracy'])
history = net.fit(x, y,
                  batch_size=64,
                  epochs=5,
                  validation_split=0.2)
test_scores = net.evaluate(test_x, test_y, verbose=2)
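The flip above is applied once, offline, so every epoch sees the same flipped copies. For fresh augmentation in every epoch, the usual approach is a tf.data pipeline that maps the augmentation onto each example as it is drawn. A minimal sketch, starting from the raw arrays returned by load_data (the /255 normalization is our addition, not part of the original code):

def augment(image, label):
    image = tf.image.random_flip_left_right(image)  # re-sampled every time the example is drawn
    image = tf.cast(image, tf.float32) / 255.0      # assumed normalization to [0, 1]
    return image, label

train_ds = (tf.data.Dataset.from_tensor_slices((x, y))
            .shuffle(10000)
            .map(augment, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .batch(64)
            .prefetch(tf.data.experimental.AUTOTUNE))
history = net.fit(train_ds, epochs=5)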