《从零构建大模型》系列（21）：从头实现GPT模型——构建文本生成引擎

# Illustrates the problem with un-normalized activations.
activations = torch.randn(1000, 768).mul(10)  # simulated high-variance activations
mean = torch.mean(activations, dim=1)  # per-row mean (reduced over dim=1)
std = torch.std(activations, dim=1)  # per-row standard deviation (reduced over dim=1)

# Report the spread of the per-row statistics.
print("均值范围:", torch.min(mean).item(), "~", torch.max(mean).item())
print("标准差范围:", torch.min(std).item(), "~", torch.max(std).item())

层归一化的优势：

- 稳定训练过程
- 加速收敛速度
- 缓解梯度消失/爆炸问题
- 减少对初始化的依赖

2.2 层归一化实现代码

class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Computes ``gamma * (x - mean) / sqrt(var + eps) + beta`` where ``mean`` and
    ``var`` are the (biased) mean and variance taken over the last dimension.
    This is the same formula ``torch.nn.LayerNorm`` uses, so both produce
    matching outputs for the same input and parameters.

    Args:
        d_model: Size of the last dimension of the input; also the length of
            the ``gamma`` and ``beta`` parameter vectors.
        eps: Small constant added to the variance for numerical stability.
    """

    def __init__(self, d_model, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(d_model))  # learnable scale
        self.beta = nn.Parameter(torch.zeros(d_model))  # learnable shift

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then scale and shift.

        Args:
            x: Tensor whose last dimension has size ``d_model``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Statistics over the feature (last) dimension; biased variance.
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)

        # eps goes *inside* the square root. Adding it to the standard
        # deviation instead (std + eps) gives a different denominator and
        # diverges from nn.LayerNorm, most visibly for low-variance inputs.
        x_normalized = (x - mean) / torch.sqrt(var + self.eps)

        # Learnable affine transform.
        return self.gamma * x_normalized + self.beta

# 与PyTorch官方实现对比测试
def test_layernorm():
    """Print the largest absolute gap between the custom and built-in LayerNorm outputs."""
    d_model = 768
    sample = torch.randn(2, 3, d_model)

    # Feed the same input through both implementations.
    ours = LayerNorm(d_model)(sample)
    reference = nn.LayerNorm(d_model)(sample)

    # Largest element-wise absolute deviation between the two outputs.
    diff = torch.max(torch.abs(ours - reference)).item()
    print(f"最大差异: {diff:.6f}")  # expected to be below 1e-5


test_layernorm()

三、前馈神经网络实现

3.1 GPT中的前馈结构

3.2 GELU激活函数

class GELU(nn.Module):
    """Gaussian Error Linear Unit activation (tanh approximation).

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def forward(self, x):
        """Apply the activation element-wise.

        Args:
            x: Input tensor.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Argument of tanh, split out so the parentheses stay balanced and
        # readable (the one-expression form was missing a closing paren).
        inner = math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))
        return 0.5 * x * (1.0 + torch.tanh(inner))

3.3 完整前馈网络实现

class FeedForward(nn.Module):
    """Feed-forward block: Linear (4x wider) -> GELU -> Linear (back to d_model) -> Dropout."""

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        hidden_dim = 4 * d_model
        layers = [
            nn.Linear(d_model, hidden_dim),  # expand the feature dimension
            GELU(),  # custom GELU activation
            nn.Linear(hidden_dim, d_model),  # project back to the original width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        out = self.net(x)
        return out