Vision Transformer

Vit网络架构图

 我将根据流程图用代码来讲解我自己的理解。

Step1

为了将图像转换为可作为ViT网络的输入,我们首先要对图像进行分块,即Patch Embedding。这里使用维度为[1,3,32,32]的图片作为例子,即batch_size=1,channels=3,height=width=32。

class ViT(nn.Module):
    """Vision Transformer, walkthrough excerpt for step 1 (patch embedding).

    Only the first part of the constructor is shown here: it validates the
    sizes and builds ``to_patch_embedding``.
    """
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Validate the image/patch sizes and build the patch-embedding stage.

        Args:
            image_size: Image size; passed through ``pair`` to obtain
                (height, width).
            patch_size: Patch size; passed through ``pair`` to obtain
                (height, width). Must divide the image size exactly.
            dim: Output width of the patch-embedding linear layer.
            pool: Must be ``'cls'`` or ``'mean'``.
            channels: Number of image channels; used to compute ``patch_dim``.
            num_classes, depth, heads, mlp_dim, dim_head, dropout, emb_dropout:
                Not used in the part of the constructor shown in this excerpt.

        Raises:
            AssertionError: If the image size is not divisible by the patch
                size, or if ``pool`` is not one of the accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Number of patches in the grid, and number of values in one flattened patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            # [b, c, H, W] -> [b, num_patches, patch_dim]: cut into p1 x p2 patches and flatten each one.
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            # Project every flattened patch to a dim-sized token.
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

上面的代码是整个ViT代码的一部分。为了对图像进行分块,我们首先要获得图像的height和width,同时还要给出每个块的大小,即p_height和p_width,然后就可以计算得到块的数量。假设patch_size=4,即p_height=p_width=4,则块的数量为(32/4)×(32/4)=64。为了使这些块可以作为ViT的输入,我们还需要让它们经过一个线性层(代码中线性层前后各有一个LayerNorm):这个线性层的输入维度是每一个块的所有特征元素的数量,即patch_dim=channels*p_height*p_width=3×4×4=48,输出维度是我们自己定义的dim,这里我将dim设为64。经过这些操作后,输入的维度就从起初的[1,3,32,32]变为了[1,64,64],即[batch_size,token_num,feature_num]。现在这个数据就可以作为我们的Token了。

Step2

完成了对图像的分块和线性变换之后,我们还要在原来Token的基础上加上cls向量,作为最后分类的依据,用论文中的话说就是image representation。

同时我们还要加上可学习的位置编码向量。

class ViT(nn.Module):
    """Vision Transformer, walkthrough excerpt for step 2 (class token and position embedding).

    The constructor is shown in full. ``forward`` is shown only up to the
    point where the class token and the position embedding have been added,
    so in this excerpt it does not return anything yet.
    """
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Create the patch embedding, class token, position embedding, encoder and head.

        Args:
            image_size: Image size; passed through ``pair`` to obtain
                (height, width).
            patch_size: Patch size; passed through ``pair`` to obtain
                (height, width). Must divide the image size exactly.
            num_classes: Output width of the final linear layer in ``mlp_head``.
            dim: Width of every token.
            depth, heads, dim_head, mlp_dim, dropout: Forwarded to ``Transformer``.
            pool: ``'cls'`` or ``'mean'``; stored as ``self.pool``.
            channels: Number of image channels; used to compute ``patch_dim``.
            emb_dropout: Probability of the dropout layer stored as ``self.dropout``.

        Raises:
            AssertionError: If the image size is not divisible by the patch
                size, or if ``pool`` is not one of the accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Number of patches in the grid, and number of values in one flattened patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            # [b, c, H, W] -> [b, num_patches, patch_dim]: cut into p1 x p2 patches and flatten each one.
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            # Project every flattened patch to a dim-sized token.
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        # Learnable class token; forward() prepends a copy to every sequence.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        # Classification head: LayerNorm followed by a linear layer with num_classes outputs.
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img`` into tokens and add the class token and position embedding.

        Excerpt only: the method stops after the position embedding is added.
        The ``print`` calls are the tutorial's shape traces.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')

        b, n, _ = x.shape   # [batch_size, token_num, feature_num]

        # Copy the single class token once per batch element.
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)  # [batch_size, 1, feature_num]

        # Prepend the class token: [b, n, dim] -> [b, n + 1, dim].
        x = torch.cat((cls_tokens, x), dim=1)
        # The position embedding has a leading dimension of 1 and broadcasts over the batch.
        x += self.pos_embedding[:, :(n + 1)]
        print(self.pos_embedding[:,:(n+1)].shape)    # slice shape is [1, n + 1, feature_num], not batch-sized

经过这一步操作后,我们的数据维度就由原来的[1,64,64]变为了[1,65,64]即我们的数据batch_size为1,一个样本有65个Token每个Token有64维的特征。

Step3

数据已经做好准备了,接下来我们就可以直接把数据输入到Transformer中并得到输出。

class ViT(nn.Module):
    """Vision Transformer, walkthrough excerpt for step 3 (running the encoder).

    The constructor is shown in full. ``forward`` is shown only up to the
    Transformer output, so in this excerpt it does not return anything yet.
    """
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Create the patch embedding, class token, position embedding, encoder and head.

        Args:
            image_size: Image size; passed through ``pair`` to obtain
                (height, width).
            patch_size: Patch size; passed through ``pair`` to obtain
                (height, width). Must divide the image size exactly.
            num_classes: Output width of the final linear layer in ``mlp_head``.
            dim: Width of every token.
            depth, heads, dim_head, mlp_dim, dropout: Forwarded to ``Transformer``.
            pool: ``'cls'`` or ``'mean'``; stored as ``self.pool``.
            channels: Number of image channels; used to compute ``patch_dim``.
            emb_dropout: Dropout probability applied to the tokens before the encoder.

        Raises:
            AssertionError: If the image size is not divisible by the patch
                size, or if ``pool`` is not one of the accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Number of patches in the grid, and number of values in one flattened patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            # [b, c, H, W] -> [b, num_patches, patch_dim]: cut into p1 x p2 patches and flatten each one.
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            # Project every flattened patch to a dim-sized token.
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        # Learnable class token; forward() prepends a copy to every sequence.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        # Classification head: LayerNorm followed by a linear layer with num_classes outputs.
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img`` into tokens and run them through the Transformer.

        Excerpt only: the method stops after the encoder. The ``print`` calls
        are the tutorial's shape traces.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')

        b, n, _ = x.shape   # [batch_size, token_num, feature_num]

        # Copy the single class token once per batch element.
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)  # [batch_size, 1, feature_num]

        # Prepend the class token: [b, n, dim] -> [b, n + 1, dim].
        x = torch.cat((cls_tokens, x), dim=1)
        # The position embedding has a leading dimension of 1 and broadcasts over the batch.
        x += self.pos_embedding[:, :(n + 1)]
        print(self.pos_embedding[:,:(n+1)].shape)    # slice shape is [1, n + 1, feature_num], not batch-sized

        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x:{x.shape}')

经过Transformer后我们的数据维度并不会改变,仍然是[1,65,64]

Step4

得到Transformer的输出后,我们有两种方法去获取分类的依据:一种是直接取出我们插入的cls向量作为分类的依据,另一种是对所有的特征向量求平均值。

class ViT(nn.Module):
    """Vision Transformer, walkthrough excerpt for step 4 (pooling the encoder output).

    The constructor is shown in full. ``forward`` is shown only up to the
    pooled feature vector, so in this excerpt it does not return anything yet.
    """
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Create the patch embedding, class token, position embedding, encoder and head.

        Args:
            image_size: Image size; passed through ``pair`` to obtain
                (height, width).
            patch_size: Patch size; passed through ``pair`` to obtain
                (height, width). Must divide the image size exactly.
            num_classes: Output width of the final linear layer in ``mlp_head``.
            dim: Width of every token.
            depth, heads, dim_head, mlp_dim, dropout: Forwarded to ``Transformer``.
            pool: ``'mean'`` averages all tokens; any other accepted value
                (``'cls'``) selects the first token.
            channels: Number of image channels; used to compute ``patch_dim``.
            emb_dropout: Dropout probability applied to the tokens before the encoder.

        Raises:
            AssertionError: If the image size is not divisible by the patch
                size, or if ``pool`` is not one of the accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Number of patches in the grid, and number of values in one flattened patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        self.to_patch_embedding = nn.Sequential(
            # [b, c, H, W] -> [b, num_patches, patch_dim]: cut into p1 x p2 patches and flatten each one.
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            # Project every flattened patch to a dim-sized token.
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        # Learnable class token; forward() prepends a copy to every sequence.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        # Classification head: LayerNorm followed by a linear layer with num_classes outputs.
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img``, run the Transformer and pool the result to one vector per image.

        Excerpt only: the method stops after pooling. The ``print`` calls are
        the tutorial's shape traces.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')

        b, n, _ = x.shape   # [batch_size, token_num, feature_num]

        # Copy the single class token once per batch element.
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)  # [batch_size, 1, feature_num]

        # Prepend the class token: [b, n, dim] -> [b, n + 1, dim].
        x = torch.cat((cls_tokens, x), dim=1)
        # The position embedding has a leading dimension of 1 and broadcasts over the batch.
        x += self.pos_embedding[:, :(n + 1)]
        print(self.pos_embedding[:,:(n+1)].shape)    # slice shape is [1, n + 1, feature_num], not batch-sized

        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x_output:{x.shape}')
        # 'mean' averages over the token axis; otherwise keep token 0, the class token prepended above.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        print(f'feature:{x.shape}')

这里我用的是第一种方法,最后我们的数据维度变为了[1,64],即一个样本的分类依据。

Step5

最后我们将分类依据经过mlp_head(一个LayerNorm加一个线性层),得到一个长度为类别个数的一维向量。

class ViT(nn.Module):
    """Vision Transformer image classifier.

    An image is cut into fixed-size patches, every patch is projected to a
    ``dim``-wide token, a learnable class token and position embedding are
    added, the sequence is run through ``Transformer`` and the pooled result
    is mapped to ``num_classes`` outputs.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Build all sub-modules.

        Args:
            image_size: Image size; passed through ``pair`` to obtain (height, width).
            patch_size: Patch size; passed through ``pair``. Must divide the image size.
            num_classes: Output width of the classification head.
            dim: Width of every token.
            depth, heads, dim_head, mlp_dim, dropout: Forwarded to ``Transformer``.
            pool: ``'cls'`` (use the class token) or ``'mean'`` (average all tokens).
            channels: Number of image channels.
            emb_dropout: Dropout probability applied to the tokens before the encoder.

        Raises:
            AssertionError: On a non-divisible image size or an unknown ``pool``.
        """
        super().__init__()
        img_h, img_w = pair(image_size)
        p_h, p_w = pair(patch_size)

        assert img_h % p_h == 0 and img_w % p_w == 0, 'Image dimensions must be divisible by the patch size.'

        grid_rows = img_h // p_h
        grid_cols = img_w // p_w
        num_patches = grid_rows * grid_cols
        # Number of values in one flattened patch.
        patch_dim = channels * p_h * p_w
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        patch_stages = [
            # [b, c, H, W] -> [b, num_patches, patch_dim]
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p_h, p2 = p_w),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        ]
        self.to_patch_embedding = nn.Sequential(*patch_stages)

        # One learnable position vector per patch, plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))

    def forward(self, img):
        """Classify ``img``; the ``print`` calls are the tutorial's shape traces.

        Returns:
            The output of ``mlp_head`` for the pooled feature of every image.
        """
        tokens = self.to_patch_embedding(img)
        print(f'x1:{tokens.shape}')

        batch, patch_count, _ = tokens.shape   # [batch_size, token_num, feature_num]

        # Copy the single class token once per batch element: [batch_size, 1, feature_num]
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = batch)

        tokens = torch.cat((cls_tokens, tokens), dim=1)
        positions = self.pos_embedding[:, :(patch_count + 1)]   # [1, token_num + 1, feature_num]
        tokens += positions
        print(positions.shape)

        tokens = self.transformer(self.dropout(tokens))
        print(f'x_output:{tokens.shape}')

        if self.pool == 'mean':
            feature = tokens.mean(dim = 1)
        else:
            # Token 0 is the class token prepended above.
            feature = tokens[:, 0]
        print(f'feature:{feature.shape}')

        latent = self.to_latent(feature)
        print(f'latent_x{latent.shape}')
        return self.mlp_head(latent)

Transformer

由上图可知,Transformer主要由多头注意力机制和一个前馈层构成。并且每一层都用了残差连接。首先给大家看前馈层的代码:

class FeedForward(nn.Module):
    """Token-wise MLP: ``dim -> hidden_dim -> dim`` with GELU and dropout."""

    def __init__(self, dim, hidden_dim, dropout = 0.):
        """Build the two linear layers.

        Args:
            dim: Input and output width.
            hidden_dim: Width of the single hidden layer.
            dropout: Dropout probability used after the activation and after
                the second linear layer.
        """
        super().__init__()
        expand = nn.Linear(dim, hidden_dim)
        contract = nn.Linear(hidden_dim, dim)
        stages = [expand, nn.GELU(), nn.Dropout(dropout), contract, nn.Dropout(dropout)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        out = self.net(x)
        return out

该前馈层只有1个隐藏层,接受dim维的输入,然后经过一个线性变换映射到hidden_dim维,然后再经过一个线性变换映射到dim维,经过第一个线性变换后使用了GELU函数作为激活函数。

然后就是多头注意力机制:

class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence."""

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        """Build the fused q/k/v projection and the output projection.

        Args:
            dim: Width of the incoming and outgoing tokens.
            heads: Number of attention heads.
            dim_head: Width of q, k and v inside each head.
            dropout: Dropout probability for the attention weights and for
                the output projection.
        """
        super().__init__()
        qkv_width = dim_head * heads
        # With a single head that is already dim wide, no output projection is needed.
        single_head_passthrough = heads == 1 and dim_head == dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One linear layer produces q, k and v for all heads at once.
        self.to_qkv = nn.Linear(dim, qkv_width * 3, bias = False)

        if single_head_passthrough:
            self.to_out = nn.Identity()
        else:
            self.to_out = nn.Sequential(
                nn.Linear(qkv_width, dim),
                nn.Dropout(dropout)
            )

    def forward(self, x):
        """Attend over the token axis of ``x`` ([b, n, dim]) and return [b, n, dim]."""
        fused = self.to_qkv(x)
        # Split into q, k, v and move the head axis forward: [b, n, h*d] -> [b, h, n, d].
        q, k, v = (
            rearrange(part, 'b n (h d) -> b h n d', h = self.heads)
            for part in fused.chunk(3, dim = -1)
        )
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.dropout(self.attend(scores))
        mixed = torch.matmul(weights, v)
        # Concatenate the heads again: [b, h, n, d] -> [b, n, h*d].
        merged = rearrange(mixed, 'b h n d -> b n (h d)')
        return self.to_out(merged)

代码中默认的注意力头有8个:输入先经过一个可学习的线性层to_qkv映射到3*heads*dim_head维,再切分成q、k、v三部分,并各自拆成8个头,每个头的q、k、v的特征维度都是dim_head=64。

下面是完整的Vit代码

import torch
from torch import nn

from einops import rearrange, repeat
from einops.layers.torch import Rearrange

def pair(t):
    """Return ``t`` unchanged if it is a tuple, otherwise the 2-tuple ``(t, t)``."""
    if isinstance(t, tuple):
        return t
    return (t, t)
class PreNorm(nn.Module):
    """Wrap a callable so that its input is layer-normalised first."""

    def __init__(self, dim, fn):
        """Store ``fn`` and a ``LayerNorm`` over the last ``dim`` features."""
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(norm(x), **kwargs)``; extra keyword arguments go to ``fn`` untouched."""
        normalized = self.norm(x)
        return self.fn(normalized, **kwargs)
class Linear(nn.Module):
    """Layer-normalise the last dimension of the input, then project it linearly.

    The class now derives from ``nn.Module``. As a plain Python class its
    ``LayerNorm``/``Linear`` parameters were not registered anywhere, so they
    were missing from ``parameters()`` and ``state_dict()`` and were not
    moved by ``.to()``, and the instance could not be called like a layer.
    Calling ``forward`` directly keeps working as before.
    """

    def __init__(self, in_features=28, out_features=14):
        """Create the normalisation and projection layers.

        Args:
            in_features: Size of the last input dimension (default 28, the
                previously hard-coded value).
            out_features: Size of the last output dimension (default 14, the
                previously hard-coded value).
        """
        super().__init__()
        self.nor = nn.LayerNorm(in_features)
        self.layer1 = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Return ``layer1(nor(x))``."""
        return self.layer1(self.nor(x))
class FeedForward(nn.Module):
    """Token-wise MLP: ``dim -> hidden_dim -> dim`` with GELU and dropout."""

    def __init__(self, dim, hidden_dim, dropout = 0.):
        """Build the two linear layers.

        Args:
            dim: Input and output width.
            hidden_dim: Width of the single hidden layer.
            dropout: Dropout probability used after the activation and after
                the second linear layer.
        """
        super().__init__()
        expand = nn.Linear(dim, hidden_dim)
        contract = nn.Linear(hidden_dim, dim)
        stages = [expand, nn.GELU(), nn.Dropout(dropout), contract, nn.Dropout(dropout)]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        out = self.net(x)
        return out
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention over a token sequence."""

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        """Build the fused q/k/v projection and the output projection.

        Args:
            dim: Width of the incoming and outgoing tokens.
            heads: Number of attention heads.
            dim_head: Width of q, k and v inside each head.
            dropout: Dropout probability for the attention weights and for
                the output projection.
        """
        super().__init__()
        qkv_width = dim_head * heads
        # With a single head that is already dim wide, no output projection is needed.
        single_head_passthrough = heads == 1 and dim_head == dim

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One linear layer produces q, k and v for all heads at once.
        self.to_qkv = nn.Linear(dim, qkv_width * 3, bias = False)

        if single_head_passthrough:
            self.to_out = nn.Identity()
        else:
            self.to_out = nn.Sequential(
                nn.Linear(qkv_width, dim),
                nn.Dropout(dropout)
            )

    def forward(self, x):
        """Attend over the token axis of ``x`` ([b, n, dim]) and return [b, n, dim]."""
        fused = self.to_qkv(x)
        # Split into q, k, v and move the head axis forward: [b, n, h*d] -> [b, h, n, d].
        q, k, v = (
            rearrange(part, 'b n (h d) -> b h n d', h = self.heads)
            for part in fused.chunk(3, dim = -1)
        )
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        weights = self.dropout(self.attend(scores))
        mixed = torch.matmul(weights, v)
        # Concatenate the heads again: [b, h, n, d] -> [b, n, h*d].
        merged = rearrange(mixed, 'b h n d -> b n (h d)')
        return self.to_out(merged)
class Transformer(nn.Module):
    """Stack of ``depth`` pre-norm encoder layers (attention + feed-forward, both residual)."""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        """Create ``depth`` (attention, feed-forward) pairs.

        Args:
            dim: Token width.
            depth: Number of encoder layers.
            heads: Number of attention heads per layer.
            dim_head: Width of each attention head.
            mlp_dim: Hidden width of each feed-forward block.
            dropout: Dropout probability passed to both sub-blocks.
        """
        super().__init__()
        stacked = []
        for _ in range(depth):
            attention = PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout))
            feed_forward = PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            stacked.append(nn.ModuleList([attention, feed_forward]))
        self.layers = nn.ModuleList(stacked)

    def forward(self, x):
        """Run ``x`` through every layer, adding each sub-block's output to its input."""
        for layer in self.layers:
            attention, feed_forward = layer
            x = attention(x) + x
            x = feed_forward(x) + x
        return x
class ViT(nn.Module):
    """Vision Transformer image classifier.

    An image is cut into fixed-size patches, every patch is projected to a
    ``dim``-wide token, a learnable class token and position embedding are
    added, the sequence is run through ``Transformer`` and the pooled result
    is mapped to ``num_classes`` outputs.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        """Build all sub-modules.

        Args:
            image_size: Image size; passed through ``pair`` to obtain (height, width).
            patch_size: Patch size; passed through ``pair``. Must divide the image size.
            num_classes: Output width of the classification head.
            dim: Width of every token.
            depth, heads, dim_head, mlp_dim, dropout: Forwarded to ``Transformer``.
            pool: ``'cls'`` (use the class token) or ``'mean'`` (average all tokens).
            channels: Number of image channels.
            emb_dropout: Dropout probability applied to the tokens before the encoder.

        Raises:
            AssertionError: On a non-divisible image size or an unknown ``pool``.
        """
        super().__init__()
        img_h, img_w = pair(image_size)
        p_h, p_w = pair(patch_size)

        assert img_h % p_h == 0 and img_w % p_w == 0, 'Image dimensions must be divisible by the patch size.'

        grid_rows = img_h // p_h
        grid_cols = img_w // p_w
        num_patches = grid_rows * grid_cols
        # Number of values in one flattened patch.
        patch_dim = channels * p_h * p_w
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        patch_stages = [
            # [b, c, H, W] -> [b, num_patches, patch_dim]
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = p_h, p2 = p_w),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        ]
        self.to_patch_embedding = nn.Sequential(*patch_stages)

        # One learnable position vector per patch, plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(nn.LayerNorm(dim), nn.Linear(dim, num_classes))

    def forward(self, img):
        """Classify ``img``; the ``print`` calls are the tutorial's shape traces.

        Returns:
            The output of ``mlp_head`` for the pooled feature of every image.
        """
        tokens = self.to_patch_embedding(img)
        print(f'x1:{tokens.shape}')

        batch, patch_count, _ = tokens.shape   # [batch_size, token_num, feature_num]

        # Copy the single class token once per batch element: [batch_size, 1, feature_num]
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = batch)

        tokens = torch.cat((cls_tokens, tokens), dim=1)
        positions = self.pos_embedding[:, :(patch_count + 1)]   # [1, token_num + 1, feature_num]
        tokens += positions
        print(positions.shape)

        tokens = self.transformer(self.dropout(tokens))
        print(f'x_output:{tokens.shape}')

        if self.pool == 'mean':
            feature = tokens.mean(dim = 1)
        else:
            # Token 0 is the class token prepended above.
            feature = tokens[:, 0]
        print(f'feature:{feature.shape}')

        latent = self.to_latent(feature)
        print(f'latent_x{latent.shape}')
        return self.mlp_head(latent)
# Smoke test: push one random 32x32 RGB image through a small ViT.
x = torch.randn(1, 3, 32, 32)
test = ViT(image_size=32, patch_size=4, num_classes=100, dim=64, depth=3, heads=8, mlp_dim=56)
# Call the module itself instead of .forward(): nn.Module.__call__ runs any
# registered hooks, which a direct forward() call silently skips.
output = test(x)
print(output.shape)  # one row of 100 class scores: torch.Size([1, 100])

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值