LeNet
Overall, LeNet (LeNet-5) consists of two parts:
• a convolutional encoder, made up of two convolutional layers;
• a dense block, made up of three fully-connected layers.
Code
import torch
from torch import nn
from d2l import torch as d2l
class Reshape(torch.nn.Module):
    def forward(self, x):
        return x.view(-1, 1, 28, 28)
# The LeNet network
net = torch.nn.Sequential(
    Reshape(), nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2), nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.Sigmoid(), nn.Linear(84, 10))
# Inspect the output shape after each layer
X = torch.rand(size=(1, 1, 28, 28), dtype=torch.float32)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape: \t', X.shape)
# LeNet on the Fashion-MNIST dataset
# Load the data
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size=batch_size)
# Use the GPU for computation: a modified evaluation function
def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of a model on a dataset using a GPU."""
    if isinstance(net, torch.nn.Module):
        net.eval()  # set the model to evaluation mode
        if not device:
            device = next(iter(net.parameters())).device
    # (number of correct predictions, number of predictions)
    metric = d2l.Accumulator(2)
    for X, y in data_iter:
        if isinstance(X, list):
            X = [x.to(device) for x in X]
        else:
            X = X.to(device)
        y = y.to(device)
        metric.add(d2l.accuracy(net(X), y), y.numel())
    return metric[0] / metric[1]
# Training function
def train_ch6(net, train_iter, test_iter, num_epochs, lr, device):
    """Train a model with a GPU (defined in Chapter 6)."""
    def init_weights(m):
        if type(m) == nn.Linear or type(m) == nn.Conv2d:
            nn.init.xavier_uniform_(m.weight)  # initialize the weights
    # Xavier initialization:
    # weights are drawn from a uniform or normal distribution scaled so that the
    # variance of activations and of backpropagated gradients stays roughly the
    # same across layers. This helps prevent the vanishing/exploding gradient
    # problems common in deep networks, and proper initialization usually leads
    # to faster convergence.
    # Apply init_weights to every submodule of net
    net.apply(init_weights)
    print('training on', device)
    # Move the model to the target device
    net.to(device)
    # Update the model parameters with SGD (stochastic gradient descent)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()
    # Plotting
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'test acc'])
    timer, num_batches = d2l.Timer(), len(train_iter)
    for epoch in range(num_epochs):
        # (sum of training loss, sum of training accuracy, number of examples)
        metric = d2l.Accumulator(3)
        net.train()
        for i, (X, y) in enumerate(train_iter):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                metric.add(l * X.shape[0], d2l.accuracy(y_hat, y), X.shape[0])
            timer.stop()
            train_l = metric[0] / metric[2]
            train_acc = metric[1] / metric[2]
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (train_l, train_acc, None))
        test_acc = evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, '
          f'test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec '
          f'on {str(device)}')
# Train and evaluate
lr, num_epochs = 0.9, 10
train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
AlexNet
1. AlexNet is essentially a larger and deeper LeNet.
2. It introduced dropout, the ReLU activation function, max-pooling, and data augmentation (see the augmentation sketch after the training code below).
Code
import torch
from torch import nn
from d2l import torch as d2l
net = nn.Sequential(
    nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2), nn.Flatten(),
    nn.Linear(6400, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 4096), nn.ReLU(), nn.Dropout(p=0.5),
    nn.Linear(4096, 10))
# Inspect the output of each layer
X = torch.randn(1, 1, 224, 224)
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'Output shape:\t', X.shape)
# Load the dataset
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
# Train
lr, num_epochs = 0.01, 10
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
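Point 2 above lists data augmentation among AlexNet's ingredients, but the training code in this section does not apply any. The following is a minimal, hedged sketch of what image augmentation could look like with torchvision; the transform choices and dataset path are my own illustration, not part of the original pipeline.
import torch
import torchvision
from torchvision import transforms

# Hypothetical augmentation pipeline (illustration only); the notes above train
# with d2l.load_data_fashion_mnist, which applies no augmentation.
train_augs = transforms.Compose([
    transforms.Resize(224),               # match the 224x224 input used above
    transforms.RandomHorizontalFlip(),    # random left-right flips
    transforms.ToTensor()])

train_set = torchvision.datasets.FashionMNIST(
    root='./data', train=True, transform=train_augs, download=True)
train_iter_aug = torch.utils.data.DataLoader(train_set, batch_size=128,
                                             shuffle=True)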
VGG
1. VGG builds deep convolutional neural networks from reusable convolutional blocks
(the convolution-layer + pooling-layer pattern of AlexNet is packaged into a VGG block, and the network is built by calling this block repeatedly).
2. Different numbers of convolutional blocks and different hyperparameters give variants of different complexity (see the sketch after the training code below).
Code
import torch
from torch import nn
from d2l import torch as d2l
# VGG block
# num_convs: number of convolutional layers in the block
def vgg_block(num_convs, in_channels, out_channels):
    layers = []
    for _ in range(num_convs):
        # Convolutional layer
        layers.append(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1))
        # Activation function
        layers.append(nn.ReLU())
        in_channels = out_channels
    # Pooling layer
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)
# The VGG network
# Parameters of each VGG block:
# (1, 64) means num_convs=1, out_channels=64
conv_arch = ((1, 64), (1, 128), (2, 256), (2, 512), (2, 512))

def vgg(conv_arch):
    conv_blks = []
    in_channels = 1
    # Convolutional part
    for (num_convs, out_channels) in conv_arch:
        conv_blks.append(vgg_block(num_convs, in_channels, out_channels))
        in_channels = out_channels
    # Fully-connected part
    return nn.Sequential(*conv_blks, nn.Flatten(),
                         nn.Linear(out_channels * 7 * 7, 4096), nn.ReLU(),
                         nn.Dropout(0.5), nn.Linear(4096, 4096), nn.ReLU(),
                         nn.Dropout(0.5), nn.Linear(4096, 10))
net = vgg(conv_arch)
# Inspect the output shape of each block
X = torch.randn(size=(1, 1, 224, 224))
for blk in net:
    X = blk(X)
    print(blk.__class__.__name__, 'output shape:\t', X.shape)
# Since VGG-11 requires more computation than AlexNet, we build a network with fewer channels
ratio = 4
small_conv_arch = [(pair[0], pair[1] // ratio) for pair in conv_arch]
net = vgg(small_conv_arch)
# Train
lr, num_epochs, batch_size = 0.05, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
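As the sketch referenced in point 2 above: a different conv_arch yields a different VGG variant. The configuration below is my own example of a VGG-16-style setup (13 convolutional layers), built with the same vgg() function defined above.
# Hedged example: a VGG-16-style configuration; each pair is (num_convs, out_channels)
conv_arch_vgg16 = ((2, 64), (2, 128), (3, 256), (3, 512), (3, 512))
net_vgg16 = vgg(conv_arch_vgg16)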
NiN
1. A NiN block is a convolutional layer followed by 1*1 convolutional layers, which add a per-pixel nonlinearity (the sketch after this list shows that a 1*1 convolution acts like a fully-connected layer applied at every pixel).
2. NiN replaces the fully-connected layers of VGG and AlexNet with a global average pooling layer.
3. The result is fewer parameters and a model that is less prone to overfitting.
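A minimal sketch (my own illustration, not from the original notes) of why a 1*1 convolution adds a per-pixel nonlinearity when followed by ReLU: it is equivalent to a fully-connected layer applied independently at every pixel position.
import torch
from torch import nn

# A 1x1 convolution and a Linear layer sharing the same weights give identical results
conv1x1 = nn.Conv2d(3, 5, kernel_size=1, bias=False)
fc = nn.Linear(3, 5, bias=False)
fc.weight.data = conv1x1.weight.data.view(5, 3)   # reuse the conv weights

X = torch.randn(2, 3, 4, 4)
out_conv = conv1x1(X)
# Move channels last, apply the Linear layer per pixel, move channels back
out_fc = fc(X.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
print(torch.allclose(out_conv, out_fc, atol=1e-6))   # True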
Code
import torch
from torch import nn
from d2l import torch as d2l
# NiN block
def nin_block(in_channels, out_channels, kernel_size, strides, padding):
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size, strides, padding),
        nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU(), nn.Conv2d(out_channels, out_channels, kernel_size=1),
        nn.ReLU())
# The NiN model
net = nn.Sequential(
    nin_block(1, 96, kernel_size=11, strides=4, padding=0),
    nn.MaxPool2d(3, stride=2),
    nin_block(96, 256, kernel_size=5, strides=1, padding=2),
    nn.MaxPool2d(3, stride=2),
    nin_block(256, 384, kernel_size=3, strides=1, padding=1),
    nn.MaxPool2d(3, stride=2), nn.Dropout(0.5),
    nin_block(384, 10, kernel_size=3, strides=1, padding=1),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten())
# Inspect the output of each block
X = torch.rand(size=(1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
# Train the model
lr, num_epochs, batch_size = 0.1, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=224)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
GoogLeNet
1. In GoogLeNet, the basic convolutional block is called an Inception block.
2. An Inception block consists of four parallel paths (every path produces outputs of the same height and width, and the path outputs are concatenated along the channel dimension).
(Figure 1: the Inception block and its four parallel paths)
3. GoogLeNet stacks 9 Inception blocks and a global average pooling layer to produce its predictions.
(Figure 2: the overall GoogLeNet architecture)
4. Inception variants
1) Inception-BN (v2): adds batch normalization.
2) Inception-v3: modifies the Inception block (see the factorization sketch after this list):
   replaces 5 * 5 convolutions with multiple 3 * 3 convolutions;
   replaces 5 * 5 convolutions with 1 * 7 and 7 * 1 convolutions;
   replaces 3 * 3 convolutions with 1 * 3 and 3 * 1 convolutions;
   and is deeper overall.
3) Inception-v4: adds residual connections.
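A hedged sketch (my own illustration) of the Inception-v3 factorization idea from point 2): two stacked 3 * 3 convolutions cover the same 5 * 5 receptive field as a single 5 * 5 convolution while using fewer parameters.
import torch
from torch import nn

# Single 5x5 convolution: 16*16*5*5 = 6400 weights (plus biases)
five_by_five = nn.Conv2d(16, 16, kernel_size=5, padding=2)
# Two stacked 3x3 convolutions: 2 * 16*16*3*3 = 4608 weights, same receptive field
factorized = nn.Sequential(
    nn.Conv2d(16, 16, kernel_size=3, padding=1), nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, padding=1), nn.ReLU())

X = torch.randn(1, 16, 32, 32)
print(five_by_five(X).shape, factorized(X).shape)   # same output shape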
Code
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
# Inception block, implementing the logic of Figure 1
class Inception(nn.Module):
    # c1, c2, c3, c4 are the numbers of output channels of the four parallel paths
    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        super(Inception, self).__init__(**kwargs)
        # Path 1 in Figure 1
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # Path 2 in Figure 1
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)  # first step of path 2
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)  # second step of path 2
        # Path 3 in Figure 1
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # Path 4 in Figure 1
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        # p1, p2, p3, p4 are the four parallel paths
        p1 = F.relu(self.p1_1(x))
        p2 = F.relu(self.p2_2(F.relu(self.p2_1(x))))
        p3 = F.relu(self.p3_2(F.relu(self.p3_1(x))))
        p4 = F.relu(self.p4_2(self.p4_1(x)))
        # Concatenate the outputs of the four paths along the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)
# The GoogLeNet model, implementing the logic of Figure 2
# b = stage; b1 corresponds to stage 1, and so on
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b2 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=1), nn.ReLU(),
                   nn.Conv2d(64, 192, kernel_size=3, padding=1), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b3 = nn.Sequential(Inception(192, 64, (96, 128), (16, 32), 32),
                   Inception(256, 128, (128, 192), (32, 96), 64),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b4 = nn.Sequential(Inception(480, 192, (96, 208), (16, 48), 64),
                   Inception(512, 160, (112, 224), (24, 64), 64),
                   Inception(512, 128, (128, 256), (24, 64), 64),
                   Inception(512, 112, (144, 288), (32, 64), 64),
                   Inception(528, 256, (160, 320), (32, 128), 128),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
b5 = nn.Sequential(Inception(832, 256, (160, 320), (32, 128), 128),
                   Inception(832, 384, (192, 384), (48, 128), 128),
                   nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten())
net = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10))
# To keep training on Fashion-MNIST quick, reduce the input height and width from 224 to 96
X = torch.rand(size=(1, 1, 96, 96))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
# Train the model
lr, num_epochs, batch_size = 0.1, 10, 128
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
Batch normalization
1. Motivation
Training deep neural networks is difficult, and getting them to converge within a reasonable amount of time is even harder. Batch normalization consistently accelerates the convergence of deep networks.
2. Batch normalization
$\mathrm{BN}(x) = \gamma \odot \frac{x - \hat{\mu}_B}{\hat{\sigma}_B} + \beta$
$\hat{\mu}_B$ is the mean of the minibatch;
$\hat{\sigma}_B^2$ is the variance of the minibatch (a small constant $\epsilon$ is added to the variance to prevent it from being 0, which would cause division by zero);
$\gamma$ and $\beta$ are learnable parameters.
3. The batch normalization layer
Where it is applied:
(1) on the output of a fully-connected or convolutional layer, before the activation function
(2) on the input of a fully-connected or convolutional layer
Which dimension it acts on:
(1) for fully-connected layers, the feature dimension
(2) for convolutional layers, the channel dimension
4. Effects
(1) Because each minibatch yields different estimates of the mean and variance, batch normalization effectively injects noise, which acts as a form of regularization on model complexity
(2) It speeds up convergence but generally does not change the final model accuracy
Code
import torch
from torch import nn
from d2l import torch as d2l
# moving_mean: running mean; moving_var: running variance;
# eps: small value added to the variance to avoid division by zero
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
    # Use is_grad_enabled to tell whether we are in training or prediction mode
    if not torch.is_grad_enabled():
        # In prediction mode, use the moving averages of the mean and variance directly
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Fully-connected layer: compute the mean and variance over the feature dimension
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # 2D convolutional layer: compute the mean and variance per channel (axis=1).
            # Keep X's shape so that broadcasting works later
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # In training mode, standardize with the current batch mean and variance
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Update the moving averages of the mean and variance
        moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
        moving_var = momentum * moving_var + (1.0 - momentum) * var
    Y = gamma * X_hat + beta  # scale and shift
    return Y, moving_mean.data, moving_var.data
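A quick, hedged sanity check (my own, not in the original notes): in training mode the custom batch_norm above should agree with PyTorch's built-in nn.BatchNorm2d for a single forward pass (only the normalized output is compared; the running statistics are updated with different momentum conventions).
X = torch.randn(2, 3, 4, 4)
gamma, beta = torch.ones((1, 3, 1, 1)), torch.zeros((1, 3, 1, 1))
Y, _, _ = batch_norm(X, gamma, beta, torch.zeros((1, 3, 1, 1)),
                     torch.ones((1, 3, 1, 1)), eps=1e-5, momentum=0.9)
bn = nn.BatchNorm2d(3, eps=1e-5)
bn.train()
print(torch.allclose(Y, bn(X), atol=1e-5))   # expected: True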
# The BatchNorm layer
class BatchNorm(nn.Module):
    # num_features: number of outputs of a fully-connected layer or number of
    #               output channels of a convolutional layer
    # num_dims: 2 for a fully-connected layer, 4 for a convolutional layer
    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift parameters involved in gradient computation and
        # updates, initialized to 1 and 0 respectively
        self.gamma = nn.Parameter(torch.ones(shape))  # declared as parameters, so they are updated
        self.beta = nn.Parameter(torch.zeros(shape))
        # Variables that are not model parameters, initialized to 0 and 1
        self.moving_mean = torch.zeros(shape)
        self.moving_var = torch.ones(shape)

    def forward(self, X):
        # If X is not on the same device, copy moving_mean and moving_var
        # to the device (e.g. GPU memory) where X lives
        if self.moving_mean.device != X.device:
            self.moving_mean = self.moving_mean.to(X.device)
            self.moving_var = self.moving_var.to(X.device)
        # Save the updated moving_mean and moving_var
        Y, self.moving_mean, self.moving_var = batch_norm(
            X, self.gamma, self.beta, self.moving_mean, self.moving_var,
            eps=1e-5, momentum=0.9)
        return Y
Applying BatchNorm to LeNet
net = nn.Sequential(nn.Conv2d(1, 6, kernel_size=5), BatchNorm(6, num_dims=4),
                    nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
                    nn.Conv2d(6, 16, kernel_size=5), BatchNorm(16, num_dims=4),
                    nn.Sigmoid(), nn.MaxPool2d(kernel_size=2, stride=2),
                    nn.Flatten(), nn.Linear(16 * 4 * 4, 120),
                    BatchNorm(120, num_dims=2), nn.Sigmoid(),
                    nn.Linear(120, 84), BatchNorm(84, num_dims=2),
                    nn.Sigmoid(), nn.Linear(84, 10))
# Train
lr, num_epochs, batch_size = 1.0, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
# Inspect the learned parameters gamma and beta
net[1].gamma.reshape((-1,)), net[1].beta.reshape((-1,))
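For reference, the same network can also be written with PyTorch's built-in batch normalization layers, which manage gamma, beta and the running statistics internally. A sketch of this concise variant (structurally equivalent to the custom BatchNorm version above):
net_concise = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5), nn.BatchNorm2d(6), nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.BatchNorm2d(16), nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2), nn.Flatten(),
    nn.Linear(16 * 4 * 4, 120), nn.BatchNorm1d(120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.BatchNorm1d(84), nn.Sigmoid(),
    nn.Linear(84, 10))
# d2l.train_ch6(net_concise, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())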
ResNet
1. As networks get deeper, a deeper network is not guaranteed to be better than a shallower one. For non-nested function classes, a more complex class (represented by a larger region) is not necessarily closer to the "true" function f*; nested function classes avoid this problem.
2. Residual block
The input of block k+1 (i.e., the output of block k) is added directly to the output of block k+1's layers to form the block's output, i.e. output = f(x) + x (see the toy sketch after this list).
3. The ResNet architecture
It is similar to VGG, with the VGG blocks replaced by residual (ResNet) blocks.
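A toy sketch (my own illustration, not from the original notes) of point 2: if the residual branch f(x) outputs zero, the block reduces to the identity mapping (up to the final ReLU), so a deeper residual network can always at least reproduce a shallower one.
import torch
from torch import nn
from torch.nn import functional as F

# A convolution with all-zero weights and bias plays the role of f(x) == 0
conv = nn.Conv2d(3, 3, kernel_size=3, padding=1)
nn.init.zeros_(conv.weight)
nn.init.zeros_(conv.bias)

X = torch.rand(1, 3, 4, 4)    # non-negative input, so the final ReLU changes nothing
Y = F.relu(conv(X) + X)       # residual connection: f(X) + X with f(X) == 0
print(torch.allclose(X, Y))   # True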
Code
import torch
from torch import nn
from torch.nn import functional as F
from d2l import torch as d2l
# Residual block
class Residual(nn.Module):
    def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
        super().__init__()
        # The two convolutional layers of the residual branch f(x)
        self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3,
                               padding=1, stride=strides)
        self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3,
                               padding=1)
        if use_1x1conv:
            # Apply a 1x1 convolution to the input before adding it to the output
            # (needed when the channel count or spatial size changes)
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            # Leave the shortcut path untransformed
            self.conv3 = None
        # Batch normalization after each convolution of the residual branch
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)
        # ReLU applied to the combined output
        self.relu = nn.ReLU(inplace=True)

    # The residual-block logic
    def forward(self, X):
        Y = F.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3:
            X = self.conv3(X)
        # Add the shortcut to the residual branch output
        Y += X
        return F.relu(Y)
# Input and output shapes are the same
blk = Residual(3, 3)
X = torch.rand(4, 3, 6, 6)
Y = blk(X)
print(Y.shape)
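# expected output: torch.Size([4, 3, 6, 6])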
# Increase the number of output channels while halving the output height and width
blk = Residual(3, 6, use_1x1conv=True, strides=2)
print(blk(X).shape)
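# expected output: torch.Size([4, 6, 3, 3])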
# The ResNet model
# stage 1
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# Stage generator; num_residuals is the number of residual blocks in the stage
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
    blk = []
    for i in range(num_residuals):
        # In every stage built from residual blocks except the first (i.e. b3, b4, b5),
        # the first block halves the height and width. b1 has already reduced the
        # resolution, so b2 does not halve it again.
        if i == 0 and not first_block:
            blk.append(Residual(input_channels, num_channels, use_1x1conv=True,
                                strides=2))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk
# Stages 2-5 are built from residual blocks
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))
# The full network
net = nn.Sequential(b1, b2, b3, b4, b5, nn.AdaptiveAvgPool2d((1, 1)),nn.Flatten(), nn.Linear(512, 10))
# Output shapes of the different ResNet stages
X = torch.rand(size=(1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)
# Train
lr, num_epochs, batch_size = 0.05, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())
DenseNet
1. DenseNet is an extension of ResNet.
2. The key difference between ResNet and DenseNet is that DenseNet concatenates outputs (denoted by [ , ] below) instead of adding them as ResNet does. As an increasingly complex sequence of functions is applied, the mapping from x to its expansion is:
x → [x, f1(x), f2([x, f1(x)]), f3([x, f1(x), f2([x, f1(x)])]), . . .]
3. A dense network consists of two main components: dense blocks, which define how inputs and outputs are concatenated, and transition layers, which control the number of channels so that it does not grow too large.
Code
Dense blocks
import torch
from torch import nn
from d2l import torch as d2l
# Define a convolution block (batch normalization, ReLU, 3x3 convolution)
def conv_block(input_channels, num_channels):
    return nn.Sequential(nn.BatchNorm2d(input_channels),
                         nn.ReLU(),
                         nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1))
# Define a dense block: a dense block consists of multiple convolution blocks,
# each using the same number of output channels.
# Final number of output channels = input_channels + num_convs * num_channels
class DenseBlock(nn.Module):
    def __init__(self, num_convs, input_channels, num_channels):
        super(DenseBlock, self).__init__()
        layer = []
        for i in range(num_convs):
            layer.append(conv_block(num_channels * i + input_channels, num_channels))
        self.net = nn.Sequential(*layer)

    # In the forward pass, the input and output of each convolution block are
    # concatenated along the channel dimension.
    def forward(self, X):
        for blk in self.net:
            Y = blk(X)
            # Concatenate the input and output of each block along the channel dimension
            X = torch.cat((X, Y), dim=1)
        return X
# Example usage
blk = DenseBlock(2, 3, 10)
X = torch.randn(4, 3, 8, 8)
Y = blk(X)
Y.shape
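# expected: torch.Size([4, 23, 8, 8]), since 3 + 2 * 10 = 23 output channels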
Transition layer: it reduces the number of channels with a 1 × 1 convolutional layer and halves the height and width with an average pooling layer of stride 2, further reducing model complexity.
def transition_block(input_channels, num_channels):
    return nn.Sequential(
        nn.BatchNorm2d(input_channels), nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))
blk = transition_block(23, 10)
blk(Y).shape
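# expected: torch.Size([4, 10, 4, 4]): channels reduced to 10, height and width halved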
The DenseNet model
b1 = nn.Sequential(nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
                   nn.BatchNorm2d(64), nn.ReLU(),
                   nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# num_channels is the current number of channels; growth_rate is the number of
# channels each convolution block adds
num_channels, growth_rate = 64, 32
num_convs_in_dense_blocks = [4, 4, 4, 4]
# Dense blocks
blks = []
# Build the dense blocks
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    blks.append(DenseBlock(num_convs, num_channels, growth_rate))
    # Output channels of the previous dense block
    num_channels += num_convs * growth_rate
    # Add a transition layer between dense blocks to halve the number of channels
    if i != len(num_convs_in_dense_blocks) - 1:
        blks.append(transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2
# The full DenseNet model
net = nn.Sequential(b1, *blks,
                    nn.BatchNorm2d(num_channels), nn.ReLU(),
                    nn.AdaptiveAvgPool2d((1, 1)),
                    nn.Flatten(),
                    nn.Linear(num_channels, 10))
# Train
lr, num_epochs, batch_size = 0.1, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size, resize=96)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr, d2l.try_gpu())