Tied Embeddings(权重共享技术)在Transformer架构中广泛应用,尤其是在语言模型中,它通过共享输入嵌入层和输出层的权重来减少模型参数量,优化训练效率。
使用Tied Embeddings:
#使用Tied Embeddings:
import torch
import torch.nn as nn
import torch.nn.functional as F
class SharedEmbeddingTransformer(nn.Module):
    """Transformer encoder with tied input-embedding and output-projection weights.

    The vocabulary projection (``output_layer``) reuses the very same
    ``Parameter`` object as ``token_embedding``, so the
    ``num_tokens x embed_dim`` matrix is stored and trained only once.

    Args:
        num_tokens: Vocabulary size.
        embed_dim: Width of the token embeddings and of the encoder.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer. PyTorch's multi-head attention
            requires ``embed_dim`` to be divisible by this value.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
    """

    def __init__(self, num_tokens, embed_dim, num_layers=2, num_heads=8,
                 dim_feedforward=2048):
        super().__init__()
        self.token_embedding = nn.Embedding(num_tokens, embed_dim)

        # Keep the template layer in a local variable: nn.TransformerEncoder
        # deep-copies it ``num_layers`` times, so registering the template as
        # an attribute would add a full layer of parameters that forward()
        # never touches and that would inflate the parameter count.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=dim_feedforward
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        self.output_layer = nn.Linear(embed_dim, num_tokens, bias=False)
        # Tie the weights: nn.Linear stores its weight as (out_features,
        # in_features) = (num_tokens, embed_dim), the same shape as the
        # embedding table, so the Parameter can be shared directly.
        self.output_layer.weight = self.token_embedding.weight

        # Show both weights so the reader can see they are identical.
        print("self.token_embedding.weight:", self.token_embedding.weight)
        print("self.output_layer.weight:", self.output_layer.weight)

    def forward(self, input_indices):
        """Map token ids to vocabulary logits.

        Args:
            input_indices: Integer tensor of token ids, laid out as
                (batch_size, seq_len).

        Returns:
            Logits tensor of shape (batch_size, seq_len, num_tokens).
        """
        x = self.token_embedding(input_indices)  # (batch, seq, embed_dim)
        # The encoder is built with the default batch_first=False, so it
        # expects (seq_len, batch_size, embed_dim).
        x = x.transpose(0, 1)
        x = self.transformer_encoder(x)
        x = x.transpose(0, 1)  # back to (batch_size, seq_len, embed_dim)
        # Projection onto the vocabulary using the shared embedding matrix.
        logits = self.output_layer(x)
        return logits
# Build the model.
vocabulary_size = 100000  # vocabulary size
embedding_dimension = 512  # embedding width
num_layers = 4  # number of Transformer encoder layers
model = SharedEmbeddingTransformer(vocabulary_size, embedding_dimension, num_layers)

# Count the trainable parameters. Module.parameters() yields each distinct
# tensor once, so a weight shared between two layers is not double-counted.
total_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"模型的总参数量: {total_parameters}")

# Smoke-test the forward pass on random token ids.
batch_size = 5
sequence_length = 20
input_ids = torch.randint(0, vocabulary_size, (batch_size, sequence_length))
# Only the output shape is inspected, so skip autograd bookkeeping for the
# large (batch, seq, vocab) logits tensor.
with torch.no_grad():
    output = model(input_ids)
# Expected: torch.Size([5, 20, 100000]),
# i.e. (batch_size, sequence_length, vocabulary_size).
print(output.shape)
输出:
不使用Tied Embeddings
#不使用Tied Embeddings
import torch
import torch.nn as nn
import torch.nn.functional as F
class SharedEmbeddingTransformer(nn.Module):
    """Transformer encoder with independent embedding and output weights.

    This is the untied baseline: despite the class name, ``token_embedding``
    and ``output_layer`` each own a separate ``num_tokens x embed_dim``
    matrix, so the model holds that matrix twice.

    Args:
        num_tokens: Vocabulary size.
        embed_dim: Width of the token embeddings and of the encoder.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per layer. PyTorch's multi-head attention
            requires ``embed_dim`` to be divisible by this value.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
    """

    def __init__(self, num_tokens, embed_dim, num_layers=2, num_heads=8,
                 dim_feedforward=2048):
        super().__init__()
        self.token_embedding = nn.Embedding(num_tokens, embed_dim)

        # Keep the template layer in a local variable: nn.TransformerEncoder
        # deep-copies it ``num_layers`` times, so registering the template as
        # an attribute would add a full layer of parameters that forward()
        # never touches and that would inflate the parameter count.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads, dim_feedforward=dim_feedforward
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        # Deliberately NOT tied: the output projection keeps its own
        # freshly initialised weight instead of reusing the embedding table.
        self.output_layer = nn.Linear(embed_dim, num_tokens, bias=False)

        # Show both weights so the reader can see they differ.
        print("self.token_embedding.weight:", self.token_embedding.weight)
        print("self.output_layer.weight:", self.output_layer.weight)

    def forward(self, input_indices):
        """Map token ids to vocabulary logits.

        Args:
            input_indices: Integer tensor of token ids, laid out as
                (batch_size, seq_len).

        Returns:
            Logits tensor of shape (batch_size, seq_len, num_tokens).
        """
        x = self.token_embedding(input_indices)  # (batch, seq, embed_dim)
        # The encoder is built with the default batch_first=False, so it
        # expects (seq_len, batch_size, embed_dim).
        x = x.transpose(0, 1)
        x = self.transformer_encoder(x)
        x = x.transpose(0, 1)  # back to (batch_size, seq_len, embed_dim)
        # Projection onto the vocabulary using the layer's own weights.
        logits = self.output_layer(x)
        return logits
# Build the model.
vocabulary_size = 100000  # vocabulary size
embedding_dimension = 512  # embedding width
num_layers = 4  # number of Transformer encoder layers
model = SharedEmbeddingTransformer(vocabulary_size, embedding_dimension, num_layers)

# Count the trainable parameters. Module.parameters() yields each distinct
# tensor once, so separately owned weight matrices are each counted.
total_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"模型的总参数量: {total_parameters}")

# Smoke-test the forward pass on random token ids.
batch_size = 5
sequence_length = 20
input_ids = torch.randint(0, vocabulary_size, (batch_size, sequence_length))
# Only the output shape is inspected, so skip autograd bookkeeping for the
# large (batch, seq, vocab) logits tensor.
with torch.no_grad():
    output = model(input_ids)
# Expected: torch.Size([5, 20, 100000]),
# i.e. (batch_size, sequence_length, vocabulary_size).
print(output.shape)
输出:
总结
可见使用Tied Embeddings后,模型省去了一个独立的输出层权重矩阵(vocabulary_size × embedding_dimension = 100000 × 512,约5120万个参数),总参数量减少了四成多。不过,该技术也存在一些弊端,使用时需要权衡:
- 在训练初期,Tied Embeddings可能导致输入嵌入层的梯度过小,影响训练速度;
- 由于输入和输出层共享权重,优化过程中可能会出现梯度冲突,影响模型的收敛;
- Tied Embeddings假设输入和输出的嵌入表示可以共享同一组权重,但实际上,输入嵌入和输出嵌入可能需要不同的特征表示。例如,输入嵌入可能更关注上下文信息,而输出嵌入可能需要更精确的词汇预测能力;
- 建议先使用tied embedding,收敛一段时间之后再改用untied embedding,既不损失模型的capacity,又能够加速word embedding层的训练。(来自知乎:https://zhuanlan.zhihu.com/p/667504988)