
我将根据流程图用代码来讲解我自己的理解。
Step1
为了将图像转换为可作为ViT网络的输入,我们首先要对图像进行分块,即Patch Embedding。这里使用维度为[1,3,32,32]的图片作为例子,即batch_size=1,channels=3,height=width=32。
class ViT(nn.Module):
    """Vision Transformer (tutorial excerpt: patch-embedding setup only).

    This excerpt stops after building ``to_patch_embedding``; it defines no
    ``forward`` method.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Validate the image/patch geometry and build the patch embedding.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            dim: output width of the patch-embedding linear layer.
            pool: must be ``'cls'`` or ``'mean'``.
            channels: number of image channels.
            num_classes, depth, heads, mlp_dim, dim_head, dropout, emb_dropout:
                accepted but not used in this excerpt.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )
上面的代码是整个ViT代码的一部分。为了对图像进行分块,我们首先要获得图像的height和width,同时还要给出每个块的size,即p_height和p_width,然后就可以计算得到块的数量。假设patch_size=4,即p_height=p_width=4,则块的数量为(32/4)*(32/4)=64。为了使这些块可以作为ViT的输入,我们还需要让它们经过一个线性层(代码中线性层前后各有一个LayerNorm)。这个线性层的输入维度是每一个块中所有特征元素的数量,即patch_dim=channels*p_height*p_width=3*4*4=48,输出维度是我们自己定义的dim,这里我将dim设为64。经过这些操作后,输入维度就从[1,3,32,32]转换为[1,64,64],即[batch_size,token_num,feature_num]。现在这个数据就可以作为我们的Token了。
Step2
完成了对图像的分块和线性变换后,我们还要在原来Token的基础上加上cls向量作为最后分类的依据,用论文中的话说就是image representation。
同时我们还要加上可学习的位置编码向量。
class ViT(nn.Module):
    """Vision Transformer (tutorial excerpt: up to the positional embedding).

    ``forward`` in this excerpt stops after adding the positional embedding
    and has no ``return`` statement.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Build the patch embedding, cls token, positional embedding, encoder and head.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            num_classes: output width of the classification head.
            dim: token embedding width.
            depth, heads, dim_head, mlp_dim, dropout: forwarded to ``Transformer``.
            pool: ``'cls'`` or ``'mean'``.
            channels: number of image channels.
            emb_dropout: dropout probability for ``self.dropout``.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the cls token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img`` into tokens, prepend the cls token, add positions.

        Prints intermediate shapes for the tutorial.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')
        b, n, _ = x.shape  # (batch, num_patches, dim)

        # Copy the single learnable cls token to every sample: (batch, 1, dim).
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch, n + 1, dim)
        x += self.pos_embedding[:, :(n + 1)]
        # The slice is (1, n + 1, dim); it broadcasts over the batch axis.
        print(self.pos_embedding[:,:(n+1)].shape)
经过这一步操作后,我们的数据维度就由原来的[1,64,64]变为了[1,65,64]即我们的数据batch_size为1,一个样本有65个Token每个Token有64维的特征。
Step3
数据已经做好准备了,接下来我们就可以直接把数据输入到Transformer中并得到输出。
class ViT(nn.Module):
    """Vision Transformer (tutorial excerpt: up to the transformer output).

    ``forward`` in this excerpt stops after the encoder and has no ``return``
    statement.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Build the patch embedding, cls token, positional embedding, encoder and head.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            num_classes: output width of the classification head.
            dim: token embedding width.
            depth, heads, dim_head, mlp_dim, dropout: forwarded to ``Transformer``.
            pool: ``'cls'`` or ``'mean'``.
            channels: number of image channels.
            emb_dropout: dropout probability applied to the embedded tokens.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the cls token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img`` into tokens and run them through the transformer.

        Prints intermediate shapes for the tutorial.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')
        b, n, _ = x.shape  # (batch, num_patches, dim)

        # Copy the single learnable cls token to every sample: (batch, 1, dim).
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch, n + 1, dim)
        x += self.pos_embedding[:, :(n + 1)]
        # The slice is (1, n + 1, dim); it broadcasts over the batch axis.
        print(self.pos_embedding[:,:(n+1)].shape)
        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x:{x.shape}')
经过Transformer后我们的数据维度并不会改变,仍然是[1,65,64]
Step4
得到Transformer的输出后,我们有两种方法来获取分类的依据:一种是直接取出我们插入的cls向量作为分类依据,另一种是对所有Token的特征向量求平均值。
class ViT(nn.Module):
    """Vision Transformer (tutorial excerpt: up to token pooling).

    ``forward`` in this excerpt stops after pooling and has no ``return``
    statement.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Build the patch embedding, cls token, positional embedding, encoder and head.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            num_classes: output width of the classification head.
            dim: token embedding width.
            depth, heads, dim_head, mlp_dim, dropout: forwarded to ``Transformer``.
            pool: ``'cls'`` (use the cls token) or ``'mean'`` (average all tokens).
            channels: number of image channels.
            emb_dropout: dropout probability applied to the embedded tokens.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the cls token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Embed ``img``, run the transformer, and pool the tokens.

        Prints intermediate shapes for the tutorial.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')
        b, n, _ = x.shape  # (batch, num_patches, dim)

        # Copy the single learnable cls token to every sample: (batch, 1, dim).
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch, n + 1, dim)
        x += self.pos_embedding[:, :(n + 1)]
        # The slice is (1, n + 1, dim); it broadcasts over the batch axis.
        print(self.pos_embedding[:,:(n+1)].shape)
        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x_output:{x.shape}')

        # Index 0 is the cls token because it was concatenated first along dim 1.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        print(f'feature:{x.shape}')
这里我用的是第一种方法,最后我们的数据维度变为了[1,64],即一个样本的分类依据。
Step5
最后我们将分类依据经过一个线性层,得到一个长度为类别个数的一维向量。
class ViT(nn.Module):
    """Vision Transformer image classifier.

    Splits an image into patches, embeds each patch as a token, prepends a
    learnable cls token, adds learnable positional embeddings, runs the
    sequence through ``Transformer``, pools, and classifies.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Build the patch embedding, cls token, positional embedding, encoder and head.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            num_classes: output width of the classification head.
            dim: token embedding width.
            depth, heads, dim_head, mlp_dim, dropout: forwarded to ``Transformer``.
            pool: ``'cls'`` (use the cls token) or ``'mean'`` (average all tokens).
            channels: number of image channels.
            emb_dropout: dropout probability applied to the embedded tokens.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the cls token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Classify ``img`` and return the output of ``mlp_head``.

        Prints intermediate shapes for the tutorial.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')
        b, n, _ = x.shape  # (batch, num_patches, dim)

        # Copy the single learnable cls token to every sample: (batch, 1, dim).
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch, n + 1, dim)
        x += self.pos_embedding[:, :(n + 1)]
        # The slice is (1, n + 1, dim); it broadcasts over the batch axis.
        print(self.pos_embedding[:,:(n+1)].shape)
        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x_output:{x.shape}')

        # Index 0 is the cls token because it was concatenated first along dim 1.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        print(f'feature:{x.shape}')

        x = self.to_latent(x)
        print(f'latent_x{x.shape}')
        return self.mlp_head(x)
Transformer
由上图可知,Transformer主要由多头注意力机制和一个前馈层构成。并且每一层都用了残差连接。首先给大家看前馈层的代码:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout = 0.):
        """Build the two-layer MLP.

        Args:
            dim: width of both the input and the output features.
            hidden_dim: width of the hidden layer.
            dropout: dropout probability used after each linear stage.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)
该前馈层只有1个隐藏层,接受dim维的输入,然后经过一个线性变换映射到hidden_dim维,然后再经过一个线性变换映射到dim维,经过第一个线性变换后使用了GELU函数作为激活函数。
然后就是多头注意力机制:
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        """Build the fused q/k/v projection and the output projection.

        Args:
            dim: width of the input (and output) token features.
            heads: number of attention heads.
            dim_head: feature width of each head.
            dropout: dropout probability for the attention weights and the
                output projection.
        """
        super().__init__()
        inner_dim = dim_head * heads
        # With a single head whose width already equals `dim`, the output
        # projection is skipped and replaced by an identity.
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One linear layer produces q, k and v side by side.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        """Attend over the token axis of ``x``, given as (b, n, dim)."""
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        # Split each of q, k, v into heads: (b, n, h*d) -> (b, h, n, d).
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        # Pairwise token similarities per head: (b, h, n, n).
        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)  # (b, h, n, d)
        # Merge the heads back into the feature axis: (b, n, h*d).
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
代码中默认有8个注意力头:输入先经过一个无偏置的线性层to_qkv映射到3*heads*dim_head维,再沿最后一维平均切成q、k、v三份,每一份再拆成8个头,每个头的q、k、v特征维度都是dim_head=64。各头分别计算缩放点积注意力后,结果在特征维上拼接,再由to_out映射回dim维。
下面是完整的Vit代码
import torch
from torch import nn
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
def pair(t):
    """Return ``t`` as a 2-tuple.

    A tuple is returned unchanged and a list is converted to a tuple, so that
    ``[h, w]`` is read as (height, width) rather than being duplicated. Any
    other value is duplicated into ``(t, t)``.
    """
    return tuple(t) if isinstance(t, (tuple, list)) else (t, t)
class PreNorm(nn.Module):
    """Apply LayerNorm to the input, then call the wrapped function."""

    def __init__(self, dim, fn):
        """Store the wrapped callable and build the normalisation layer.

        Args:
            dim: size of the last dimension normalised by ``nn.LayerNorm``.
            fn: callable (typically an ``nn.Module``) applied after the norm.
        """
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``fn(norm(x), **kwargs)``."""
        return self.fn(self.norm(x), **kwargs)
class Linear(nn.Module):
    """LayerNorm followed by a linear projection.

    Subclasses ``nn.Module`` so its parameters are registered and instances
    can be called directly (``m(x)``) as well as through ``m.forward(x)``.
    """

    def __init__(self, in_features = 28, out_features = 14):
        """Build the layers.

        Args:
            in_features: size of the input's last dimension (default 28).
            out_features: size of the output's last dimension (default 14).
        """
        super().__init__()
        self.nor = nn.LayerNorm(in_features)
        self.layer1 = nn.Linear(in_features, out_features)

    def forward(self, x):
        """Normalise ``x`` over its last dimension, then project it."""
        return self.layer1(self.nor(x))
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout."""

    def __init__(self, dim, hidden_dim, dropout = 0.):
        """Build the two-layer MLP.

        Args:
            dim: width of both the input and the output features.
            hidden_dim: width of the hidden layer.
            dropout: dropout probability used after each linear stage.
        """
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``."""
        return self.net(x)
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        """Build the fused q/k/v projection and the output projection.

        Args:
            dim: width of the input (and output) token features.
            heads: number of attention heads.
            dim_head: feature width of each head.
            dropout: dropout probability for the attention weights and the
                output projection.
        """
        super().__init__()
        inner_dim = dim_head * heads
        # With a single head whose width already equals `dim`, the output
        # projection is skipped and replaced by an identity.
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.dropout = nn.Dropout(dropout)

        # One linear layer produces q, k and v side by side.
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        """Attend over the token axis of ``x``, given as (b, n, dim)."""
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        # Split each of q, k, v into heads: (b, n, h*d) -> (b, h, n, d).
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        # Pairwise token similarities per head: (b, h, n, n).
        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)
        attn = self.dropout(attn)

        out = torch.matmul(attn, v)  # (b, h, n, d)
        # Merge the heads back into the feature axis: (b, n, h*d).
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
class Transformer(nn.Module):
    """Stack of pre-norm attention and feed-forward layers with residuals."""

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        """Build ``depth`` (attention, feed-forward) layer pairs.

        Args:
            dim: token feature width.
            depth: number of layer pairs.
            heads: attention heads per layer.
            dim_head: feature width of each attention head.
            mlp_dim: hidden width of each feed-forward block.
            dropout: dropout probability passed to both sub-blocks.
        """
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))

    def forward(self, x):
        """Run ``x`` through every layer pair and return the result."""
        for attn, ff in self.layers:
            # Residual connection around each pre-normalised sub-block.
            x = attn(x) + x
            x = ff(x) + x
        return x
class ViT(nn.Module):
    """Vision Transformer image classifier.

    Splits an image into patches, embeds each patch as a token, prepends a
    learnable cls token, adds learnable positional embeddings, runs the
    sequence through ``Transformer``, pools, and classifies.
    """

    def __init__(
        self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim,
        pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.
    ):
        """Build the patch embedding, cls token, positional embedding, encoder and head.

        Args:
            image_size: int or (height, width) tuple, normalised by ``pair``.
            patch_size: int or (height, width) tuple, normalised by ``pair``.
            num_classes: output width of the classification head.
            dim: token embedding width.
            depth, heads, dim_head, mlp_dim, dropout: forwarded to ``Transformer``.
            pool: ``'cls'`` (use the cls token) or ``'mean'`` (average all tokens).
            channels: number of image channels.
            emb_dropout: dropout probability applied to the embedded tokens.

        Raises:
            AssertionError: if the image size is not divisible by the patch
                size, or ``pool`` is not one of the two accepted values.
        """
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        # Patches per image, and flattened element count of a single patch.
        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        # (b, c, H, W) -> (b, num_patches, patch_dim) -> (b, num_patches, dim)
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        # One learnable position vector per patch, plus one for the cls token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Classify ``img`` and return the output of ``mlp_head``.

        Prints intermediate shapes for the tutorial.
        """
        x = self.to_patch_embedding(img)
        print(f'x1:{x.shape}')
        b, n, _ = x.shape  # (batch, num_patches, dim)

        # Copy the single learnable cls token to every sample: (batch, 1, dim).
        cls_tokens = repeat(self.cls_token, '1 1 d -> b 1 d', b = b)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch, n + 1, dim)
        x += self.pos_embedding[:, :(n + 1)]
        # The slice is (1, n + 1, dim); it broadcasts over the batch axis.
        print(self.pos_embedding[:,:(n+1)].shape)
        x = self.dropout(x)

        x = self.transformer(x)
        print(f'x_output:{x.shape}')

        # Index 0 is the cls token because it was concatenated first along dim 1.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]
        print(f'feature:{x.shape}')

        x = self.to_latent(x)
        print(f'latent_x{x.shape}')
        return self.mlp_head(x)
# Smoke test: push one random 3x32x32 image through a small ViT and print the
# shape of the returned tensor.
x = torch.randn(1, 3, 32, 32)
test = ViT(image_size=32, patch_size=4, num_classes=100, dim=64, depth=3, heads=8, mlp_dim=56)
# Call the module itself rather than .forward() so that nn.Module.__call__
# (and any registered hooks) runs.
output = test(x)
print(output.shape)