Understanding the Core GPT Architecture
GPT is built on the Transformer decoder architecture; its core pieces are multi-head self-attention and a position-wise feed-forward network. A simplified version should keep the following key components: self-attention layers, layer normalization, residual connections, and positional encoding.
Preparing the Training Data
Pick a text dataset of appropriate size (e.g. WikiText-2). You need a tokenizer that converts text into a sequence of token IDs; Byte Pair Encoding (BPE) is the standard choice, or you can simply use a ready-made tokenizer from HuggingFace.
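For example, a pretrained BPE tokenizer can be loaded in a couple of lines (a minimal sketch assuming the HuggingFace transformers package and the GPT-2 vocabulary; any BPE tokenizer works):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")   # pretrained BPE tokenizer, ~50k vocab
ids = tokenizer.encode("A simplified GPT implementation")
print(ids)                         # list of token IDs
print(tokenizer.decode(ids))       # round-trips back to the original text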
Implementing the Basic Components
import math
import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_head = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        # Scaled dot-product attention with a causal mask so no position can see future tokens
        B, T, C = x.shape
        q = self.W_q(x).view(B, T, self.num_heads, self.d_head).transpose(1, 2)
        k = self.W_k(x).view(B, T, self.num_heads, self.d_head).transpose(1, 2)
        v = self.W_v(x).view(B, T, self.num_heads, self.d_head).transpose(1, 2)
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_head)
        mask = torch.triu(torch.ones(T, T, dtype=torch.bool, device=x.device), diagonal=1)
        scores = scores.masked_fill(mask, float('-inf'))
        attn = torch.softmax(scores, dim=-1)
        out = (attn @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.W_o(out)
class FeedForward(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        # Position-wise MLP with the usual 4x hidden expansion
        self.net = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model)
        )

    def forward(self, x):
        return self.net(x)
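A quick sanity check on shapes (the toy dimensions here are chosen for illustration):

x = torch.randn(2, 16, 256)                     # (batch, seq_len, d_model)
attn = MultiHeadAttention(d_model=256, num_heads=8)
ffn = FeedForward(d_model=256)
print(attn(x).shape, ffn(x).shape)              # both torch.Size([2, 16, 256])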
Building the Transformer Block
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Pre-norm residual connections, as in GPT-2
        x = x + self.attn(self.norm1(x))
        x = x + self.ffn(self.norm2(x))
        return x
Implementing Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        # Fixed sinusoidal encodings, as in "Attention Is All You Need"
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add the encodings for the first seq_len positions to the embeddings
        return x + self.pe[:x.size(1)]
Assembling the Full Model
class MiniGPT(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_layers=4, num_heads=8):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([TransformerBlock(d_model, num_heads) for _ in range(num_layers)])
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # x: (batch, seq_len) token IDs -> (batch, seq_len, vocab_size) logits
        x = self.pos_enc(self.token_emb(x))
        for layer in self.layers:
            x = layer(x)
        return self.lm_head(x)
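A forward pass with random token IDs verifies the output shape; with the default sizes the model has roughly 29M parameters (the toy batch below is purely illustrative):

model = MiniGPT(vocab_size=50000)
dummy = torch.randint(0, 50000, (2, 64))              # (batch, seq_len) of token IDs
print(model(dummy).shape)                              # torch.Size([2, 64, 50000])
print(sum(p.numel() for p in model.parameters()))      # about 28.8M with these defaults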
Setting Up the Training Loop
Use the cross-entropy loss with the AdamW optimizer. Example key training hyperparameters (a data-loading sketch follows the list):
- Batch size: 32
- Learning rate: 3e-4
- Context length: 256
- Dropout rate: 0.1 (not wired into the minimal model above, but easy to add to the attention and feed-forward layers)
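The training loop below iterates over a dataloader that yields (inputs, targets) pairs, where the targets are the inputs shifted by one token. A minimal sketch, assuming token_ids is a flat 1-D LongTensor of the tokenized training text (the names here are illustrative):

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, token_ids, context_len=256):
        self.data = token_ids
        self.context_len = context_len

    def __len__(self):
        return len(self.data) - self.context_len

    def __getitem__(self, i):
        chunk = self.data[i : i + self.context_len + 1]
        return chunk[:-1], chunk[1:]     # inputs, next-token targets

dataloader = DataLoader(TextDataset(token_ids), batch_size=32, shuffle=True)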
vocab_size = 50000
model = MiniGPT(vocab_size=vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

for batch in dataloader:
    inputs, targets = batch                  # targets = inputs shifted one token to the left
    outputs = model(inputs)                  # (batch, seq_len, vocab_size)
    loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
Implementing Inference
Generate text with temperature sampling:
@torch.no_grad()
def generate(model, prompt, max_len=50, temperature=0.7):
    model.eval()
    tokens = tokenizer.encode(prompt)
    for _ in range(max_len):
        # Feed at most the last 256 tokens (the model's context length)
        logits = model(torch.tensor([tokens[-256:]]))[:, -1, :]
        probs = torch.softmax(logits / temperature, dim=-1)
        next_token = torch.multinomial(probs, 1).item()
        tokens.append(next_token)
    return tokenizer.decode(tokens)
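Example usage, assuming the tokenizer from the data-preparation step is in scope; lower temperatures make the output more deterministic, higher ones more diverse:

print(generate(model, "The meaning of life is", temperature=0.7))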
Performance Optimization Tips
- Mixed-precision training: use torch.cuda.amp to speed up computation (see the sketch after this list)
- Gradient clipping: prevents exploding gradients
- Learning-rate scheduling: e.g. cosine annealing
- Checkpointing: periodically save the model state
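A sketch of one training step that combines these techniques, assuming the model has been moved to the GPU and num_steps (total number of training steps) is defined; the checkpoint filename is illustrative:

scaler = torch.cuda.amp.GradScaler()
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_steps)

for step, (inputs, targets) in enumerate(dataloader):
    inputs, targets = inputs.cuda(), targets.cuda()
    with torch.cuda.amp.autocast():                          # mixed-precision forward pass
        logits = model(inputs)
        loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                               # unscale before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()
    scheduler.step()                                         # cosine-annealed learning rate
    if step % 1000 == 0:                                     # periodic checkpoint
        torch.save({'model': model.state_dict(),
                    'optimizer': optimizer.state_dict()}, f'ckpt_{step}.pt')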
For real deployments it is better to start from an established library (such as HuggingFace Transformers), but the implementation above helps you understand the underlying mechanics. A complete implementation takes roughly 200-300 lines of Python, and a small model can be trained on a single consumer-grade GPU.