Another day of wrestling with this stuff.
1. Autoencoder
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, latent_dim)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim),
            nn.Sigmoid()  # Sigmoid keeps the output in the range [0, 1]
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return reconstructed

# Example
input_dim = 784   # e.g. a 28x28 image flattened into a vector
latent_dim = 64   # dimensionality of the latent space
model = Autoencoder(input_dim, latent_dim)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (illustrative)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    # Assume data is one batch of inputs; keep values in [0, 1] to match the Sigmoid output
    data = torch.rand(32, input_dim)  # dummy data
    optimizer.zero_grad()
    reconstructed = model(data)
    loss = criterion(reconstructed, data)  # reconstruction loss
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
The code is straightforward: the data goes through linear transformations that project it into a lower-dimensional latent space, then it is mapped back from that space, and we measure how far the reconstruction is from the original input. Personally it feels a lot like dimensionality reduction. As for how anomalies are detected: compare the reconstruction error of a new sample against the loss level the model reached during training; if it is much larger, the sample is flagged as anomalous.
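A minimal sketch of that detection step, assuming model is the trained autoencoder from above; the helper name reconstruction_errors and the mean + 3*std threshold rule are just one common heuristic, not something fixed by the code:
def reconstruction_errors(model, batch):
    # Per-sample mean squared reconstruction error
    model.eval()
    with torch.no_grad():
        recon = model(batch)
        return ((recon - batch) ** 2).mean(dim=1)

# Pick a threshold from the errors on (assumed normal) training data
train_errors = reconstruction_errors(model, torch.rand(256, input_dim))
threshold = train_errors.mean() + 3 * train_errors.std()

# Flag new samples whose reconstruction error exceeds the threshold
new_batch = torch.rand(32, input_dim)
is_anomaly = reconstruction_errors(model, new_batch) > threshold  # boolean mask per sample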
2. VAE
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        # Encoder
        self.encoder_fc1 = nn.Linear(input_dim, 512)
        self.encoder_fc2 = nn.Linear(512, latent_dim)  # mean head
        self.encoder_fc3 = nn.Linear(512, latent_dim)  # log-variance head
        # Decoder
        self.decoder_fc1 = nn.Linear(latent_dim, 512)
        self.decoder_fc2 = nn.Linear(512, input_dim)

    def encode(self, x):
        h1 = F.relu(self.encoder_fc1(x))
        return self.encoder_fc2(h1), self.encoder_fc3(h1)  # output mean and log-variance

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)  # standard deviation
        eps = torch.randn_like(std)    # random noise
        return mu + eps * std          # sample the latent variable via the reparameterization trick

    def decode(self, z):
        h2 = F.relu(self.decoder_fc1(z))
        return torch.sigmoid(self.decoder_fc2(h2))  # sigmoid keeps the output in the range [0, 1]

    def forward(self, x):
        mu, logvar = self.encode(x.view(-1, 784))  # assumes a 28x28 image flattened to 784 dims
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def loss_function(self, recon_x, x, mu, logvar):
        # Reconstruction loss
        BCE = F.binary_cross_entropy(recon_x, x.view(-1, 784), reduction='sum')
        # KL divergence between the encoder's distribution q(z|x) = N(mu, sigma^2)
        # and the standard normal prior p(z) = N(0, I):
        # KL = -0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2), where logvar = log(sigma^2).
        # This term pushes the latent distribution towards the standard normal prior.
        # Note: reduction='sum' means both terms are summed over the batch, not averaged.
        KL = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return BCE + KL

# Example
input_dim = 784   # e.g. a 28x28 image flattened into a vector
latent_dim = 64   # dimensionality of the latent space
model = VAE(input_dim, latent_dim)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop (illustrative)
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    # Assume data is one batch of inputs; BCE needs targets in [0, 1], so use torch.rand
    data = torch.rand(32, input_dim)  # dummy data
    optimizer.zero_grad()
    reconstructed, mu, logvar = model(data)
    loss = model.loss_function(reconstructed, data, mu, logvar)
    loss.backward()
    optimizer.step()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
The earlier autoencoder can be understood like this: you feed in an x, the encoder gives y = f(x), and the decoder is essentially the inverse function, x' = f⁻¹(y). So the whole pipeline is
x -> y = f(x) -> x' = f⁻¹(y)
So what is a VAE? It assumes the data follows a normal distribution. Passing the input x through the encoder gives the mean and (log-)variance of that distribution. The gap between this distribution and the standard normal distribution is measured by the KL term, i.e. the distance between two probability distributions. Once you have the mean and variance, you can sample a high-dimensional random latent vector z, and the decoder maps z back to a reconstruction x'.
So the pipeline is x -> mean(x), std(x) -> z = mean(x) + noise * std(x) -> x' = f(z)
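A tiny sketch of that pipeline with the VAE defined above, just to make the sampling step concrete (the input batch here is made up):
model.eval()
with torch.no_grad():
    x = torch.rand(4, input_dim)           # made-up batch with values in [0, 1]
    mu, logvar = model.encode(x)           # x -> mean(x), log-variance(x)
    z = model.reparameterize(mu, logvar)   # z = mean + noise * std
    x_recon = model.decode(z)              # x' = f(z)
print(x_recon.shape)                       # torch.Size([4, 784])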
3. Autoformer
import torch
import torch.nn as nn
import torch.nn.functional as F
from layers.Embed import DataEmbedding, DataEmbedding_wo_pos,DataEmbedding_wo_pos_temp,DataEmbedding_wo_temp
from layers.AutoCorrelation import AutoCorrelation, AutoCorrelationLayer
from layers.Autoformer_EncDec import Encoder, Decoder, EncoderLayer, DecoderLayer, my_Layernorm, series_decomp
import math
import numpy as np
class Model(nn.Module):
"""
Autoformer is the first method to achieve the series-wise connection,
with inherent O(LlogL) complexity
"""
def __init__(self, configs):
super(Model, self).__init__()
self.seq_len = configs.seq_len
self.label_len = configs.label_len
self.pred_len = configs.pred_len
self.output_attention = configs.output_attention
# Decomp
kernel_size = configs.moving_avg
self.decomp = series_decomp(kernel_size)
# Embedding
# The series-wise connection inherently contains the sequential information.
# Thus, we can discard the position embedding of transformers.
if configs.embed_type == 0:
self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
elif configs.embed_type == 1:
self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
elif configs.embed_type == 2:
self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
elif configs.embed_type == 3:
self.enc_embedding = DataEmbedding_wo_temp(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding_wo_temp(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
elif configs.embed_type == 4:
self.enc_embedding = DataEmbedding_wo_pos_temp(configs.enc_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
self.dec_embedding = DataEmbedding_wo_pos_temp(configs.dec_in, configs.d_model, configs.embed, configs.freq,
configs.dropout)
# Encoder
self.encoder = Encoder(
[
EncoderLayer(
AutoCorrelationLayer(
AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout,
output_attention=configs.output_attention),
configs.d_model, configs.n_heads),
configs.d_model,
configs.d_ff,
moving_avg=configs.moving_avg,
dropout=configs.dropout,
activation=configs.activation
) for l in range(configs.e_layers)
],
norm_layer=my_Layernorm(configs.d_model)
)
# Decoder
self.decoder = Decoder(
[
DecoderLayer(
AutoCorrelationLayer(
AutoCorrelation(True, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
AutoCorrelationLayer(
AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout,
output_attention=False),
configs.d_model, configs.n_heads),
configs.d_model,
configs.c_out,
configs.d_ff,
moving_avg=configs.moving_avg,
dropout=configs.dropout,
activation=configs.activation,
)
for l in range(configs.d_layers)
],
norm_layer=my_Layernorm(configs.d_model),
projection=nn.Linear(configs.d_model, configs.c_out, bias=True)
)
def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec,
enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None):
# decomp init
mean = torch.mean(x_enc, dim=1).unsqueeze(1).repeat(1, self.pred_len, 1)
zeros = torch.zeros([x_dec.shape[0], self.pred_len, x_dec.shape[2]], device=x_enc.device)
seasonal_init, trend_init = self.decomp(x_enc)
# decoder input
trend_init = torch.cat([trend_init[:, -self.label_len:, :], mean], dim=1)
seasonal_init = torch.cat([seasonal_init[:, -self.label_len:, :], zeros], dim=1)
# enc
enc_out = self.enc_embedding(x_enc, x_mark_enc)
enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask)
# dec
dec_out = self.dec_embedding(seasonal_init, x_mark_dec)
seasonal_part, trend_part = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask,
trend=trend_init)
# final
dec_out = trend_part + seasonal_part
if self.output_attention:
return dec_out[:, -self.pred_len:, :], attns
else:
return dec_out[:, -self.pred_len:, :] # [B, L, D]
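The key step in the forward pass above is the series decomposition: series_decomp splits the input into a trend part (a moving average) and a seasonal part (the residual), and the decoder later adds the two parts back together. A conceptual sketch of that decomposition, assuming the standard moving-average formulation from the Autoformer paper rather than the exact layers implementation:
import torch
import torch.nn as nn

def moving_avg_decomp(x, kernel_size=25):
    # x: (batch, seq_len, channels); pad both ends by repeating the edge values
    # so the moving average keeps the original sequence length (odd kernel assumed)
    front = x[:, :1, :].repeat(1, (kernel_size - 1) // 2, 1)
    end = x[:, -1:, :].repeat(1, (kernel_size - 1) // 2, 1)
    padded = torch.cat([front, x, end], dim=1)
    trend = nn.AvgPool1d(kernel_size, stride=1)(padded.permute(0, 2, 1)).permute(0, 2, 1)
    seasonal = x - trend                   # what is left after removing the trend
    return seasonal, trend

x = torch.randn(4, 96, 7)                  # (batch, seq_len, channels), made-up series
seasonal, trend = moving_avg_decomp(x)
print(seasonal.shape, trend.shape)         # both torch.Size([4, 96, 7])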
3.1 Transformer
https://github.com/Zeying-Gong/PatchMixer/blob/main/models/Transformer.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
# Positional Encoding for Injecting Sequence Order Information
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_len=5000):
super(PositionalEncoding, self).__init__()
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len).float().unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
def forward(self, x):
return x + self.pe[:, :x.size(1)]
# Multi-Head Attention
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, n_heads):
super(MultiHeadAttention, self).__init__()
self.d_model = d_model
self.n_heads = n_heads
self.d_k = d_model // n_heads
# Linear projections of the queries, keys, and values
self.query_linear = nn.Linear(d_model, d_model)
self.key_linear = nn.Linear(d_model, d_model)
self.value_linear = nn.Linear(d_model, d_model)
self.out_linear = nn.Linear(d_model, d_model)
def forward(self, query, key, value, mask=None):
batch_size = query.size(0)
# Linear projections
Q = self.query_linear(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
K = self.key_linear(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
V = self.value_linear(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
# Scaled Dot-Product Attention
scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
if mask is not None:
scores = scores.masked_fill(mask == 0, -1e9)
attention = torch.softmax(scores, dim=-1)
output = torch.matmul(attention, V).transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
output = self.out_linear(output)
return output
# Positionwise FeedForward Network
class PositionwiseFeedforward(nn.Module):
def __init__(self, d_model, d_ff, dropout=0.1):
super(PositionwiseFeedforward, self).__init__()
self.fc1 = nn.Linear(d_model, d_ff)
self.fc2 = nn.Linear(d_ff, d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.dropout(x)
x = self.fc2(x)
return x
# Transformer Encoder Layer
class EncoderLayer(nn.Module):
def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
super(EncoderLayer, self).__init__()
self.self_attention = MultiHeadAttention(d_model, n_heads)
self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
self.layer_norm1 = nn.LayerNorm(d_model)
self.layer_norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask=None):
attn_output = self.self_attention(x, x, x, mask)
x = self.layer_norm1(x + self.dropout(attn_output))
ff_output = self.feed_forward(x)
x = self.layer_norm2(x + self.dropout(ff_output))
return x
# Transformer Decoder Layer
class DecoderLayer(nn.Module):
def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
super(DecoderLayer, self).__init__()
self.self_attention = MultiHeadAttention(d_model, n_heads)
self.cross_attention = MultiHeadAttention(d_model, n_heads)
self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
self.layer_norm1 = nn.LayerNorm(d_model)
self.layer_norm2 = nn.LayerNorm(d_model)
self.layer_norm3 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(dropout)
def forward(self, x, memory, self_mask=None, cross_mask=None):
# Self-attention
attn_output = self.self_attention(x, x, x, self_mask)
x = self.layer_norm1(x + self.dropout(attn_output))
# Cross-attention (Encoder-Decoder attention)
attn_output = self.cross_attention(x, memory, memory, cross_mask)
x = self.layer_norm2(x + self.dropout(attn_output))
# Feed-forward
ff_output = self.feed_forward(x)
x = self.layer_norm3(x + self.dropout(ff_output))
return x
# Transformer Encoder
class Encoder(nn.Module):
def __init__(self, num_layers, d_model, n_heads, d_ff, dropout=0.1):
super(Encoder, self).__init__()
self.layers = nn.ModuleList([
EncoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)
])
self.norm = nn.LayerNorm(d_model)
def forward(self, x, mask=None):
for layer in self.layers:
x = layer(x, mask)
return self.norm(x)
# Transformer Decoder
class Decoder(nn.Module):
def __init__(self, num_layers, d_model, n_heads, d_ff, dropout=0.1):
super(Decoder, self).__init__()
self.layers = nn.ModuleList([
DecoderLayer(d_model, n_heads, d_ff, dropout) for _ in range(num_layers)
])
self.norm = nn.LayerNorm(d_model)
def forward(self, x, memory, self_mask=None, cross_mask=None):
for layer in self.layers:
x = layer(x, memory, self_mask, cross_mask)
return self.norm(x)
# Full Transformer Model
class Transformer(nn.Module):
def __init__(self, input_dim, output_dim, d_model=512, n_heads=8, num_layers=6, d_ff=2048, dropout=0.1):
super(Transformer, self).__init__()
self.encoder = Encoder(num_layers, d_model, n_heads, d_ff, dropout)
self.decoder = Decoder(num_layers, d_model, n_heads, d_ff, dropout)
self.pos_encoder = PositionalEncoding(d_model)
self.embedding = nn.Embedding(input_dim, d_model)
self.fc_out = nn.Linear(d_model, output_dim)
self.dropout = nn.Dropout(dropout)
def forward(self, src, trg, src_mask=None, trg_mask=None):
src = self.embedding(src) * math.sqrt(self.embedding.embedding_dim)
trg = self.embedding(trg) * math.sqrt(self.embedding.embedding_dim)
src = self.pos_encoder(src)
trg = self.pos_encoder(trg)
memory = self.encoder(src, src_mask)
output = self.decoder(trg, memory, trg_mask, src_mask)
return self.fc_out(output)
# Example usage
if __name__ == "__main__":
src = torch.randint(0, 10, (32, 10)) # (batch_size, seq_len)
trg = torch.randint(0, 10, (32, 10))
model = Transformer(input_dim=10, output_dim=10)
out = model(src, trg)
print(out.shape) # (batch_size, seq_len, output_dim)
1. Comparing Autoencoder and Transformer
- An Autoencoder uses its encoder to compress the input into a lower-dimensional representation (typically a vector in a latent space) and its decoder to reconstruct the input from that representation. Encoder and decoder are symmetric, and the goal is to reconstruct the input data.
- A Transformer's encoder-decoder structure is a sequence-to-sequence (Seq2Seq) architecture: the goal is to turn a source-language sequence (e.g. German) into a target-language sequence (e.g. English). Its encoder and decoder are not simply compressing and reconstructing; they learn the mapping from the source language to the target language.
2. The Transformer's Encoder and Decoder
Encoder:
- Goal: map every word of the source language into a high-dimensional space, extract the contextual information of the source sequence, and produce a context vector (or a set of vectors) that carries the syntactic and semantic information of the source language.
- The encoder's job is not just data compression (as in an Autoencoder); it mainly uses self-attention to learn the relationships between the words in the source sequence, i.e. the dependencies between words.
In a Transformer, self-attention lets the encoder take the context of all words in the sequence into account at the same time, not just local dependencies, and in this way it learns how each word relates to every other word and to the overall context (see the small shape check below).
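As a quick illustration of "every word attends to all the other words", a self-attention call with the MultiHeadAttention class from the code above (the tensors are made up):
attn = MultiHeadAttention(d_model=512, n_heads=8)
x = torch.randn(2, 10, 512)    # (batch, seq_len, d_model), stand-in word representations
out = attn(x, x, x)            # query = key = value = x, i.e. self-attention over the whole sequence
print(out.shape)               # torch.Size([2, 10, 512]), one context-mixed vector per position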
Decoder:
- Goal: use the encoder's output (the context vectors) together with the target-language words generated so far to predict the next word. It infers a new sequence from the context produced by the encoder, and that sequence is the translation in the target language (e.g. German to English).
- Autoregressive: every word the decoder generates is fed back in as input for the next decoding step, until the entire sequence has been produced.
3. Differences between Autoencoder and Transformer
- Autoencoder:
  - Encoder: compresses the input into a low-dimensional representation (a vector in latent space) that captures some key features of the input.
  - Decoder: reconstructs the input data from that latent representation; the goal is to recover the original data as closely as possible.
- Transformer:
  - Encoder: does not simply compress the input; through self-attention it learns the dependencies between the words of the source sequence and outputs context vectors that represent the syntax and semantics of the input.
  - Decoder: generates the target-language sequence from the context vectors and the target-language words produced so far. Its output is not a simple reconstruction of the input but a translation or generation, i.e. it learns the mapping from the source language to the target language.
4. How the Transformer learns to infer the next word
In a Transformer, the decoder does not simply "infer" the next item from the vectors the encoder produced; it generates the next word by combining the autoregressive mechanism with the context vectors.
Take translation as an example:
- Encoder:
  - Input: <s> Ich bin ein Student </s> (German).
  - Through self-attention, the encoder extracts the context of every word and outputs context vectors representing the source sentence; these vectors carry the syntactic and semantic information of the German sentence.
- Decoder:
  - Initial input: <s> (the start token of the target language).
  - Based on the context vectors and <s>, the decoder predicts the first target word: I.
  - It then feeds I in as the next input and, still using the source context vectors, generates the next target word: am.
  - Continuing like this, the decoder produces the complete target sequence: <s> I am a student </s> (English).
During this process the decoder is not doing plain one-shot inference; it uses self-attention and the history of what it has already generated, together with the source context vectors, to produce the next target word step by step (a minimal greedy-decoding sketch follows below).
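To make that loop concrete, here is a minimal greedy-decoding sketch using the toy Transformer from the example above (model = Transformer(input_dim=10, output_dim=10)); the token ids for <s> and </s> (1 and 2 here) are made up for illustration, and an untrained model will of course emit meaningless tokens:
model.eval()
src = torch.randint(0, 10, (1, 10))    # one source sentence of 10 token ids
trg = torch.tensor([[1]])              # start with <s>
with torch.no_grad():
    for _ in range(20):                # cap the output length
        logits = model(src, trg)       # (1, trg_len, output_dim)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # most likely next word
        trg = torch.cat([trg, next_token], dim=1)                   # feed it back in
        if next_token.item() == 2:     # stop at </s>
            break
print(trg)                             # the generated target sequence, one id per step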
5. Summary
- The Autoencoder's encoder maps the data into a latent space and its decoder tries to reconstruct the data; the goal is to reconstruct the input.
- The Transformer's encoder maps the source sequence to context vectors via self-attention, and its decoder generates the next word from those context vectors and the target words produced so far; the goal is to translate or generate the target sequence.
So although the Transformer's encoder-decoder structure resembles an Autoencoder, the Transformer is built for sequence-to-sequence tasks (such as translation): with the help of self-attention over the context vectors it learns the relationship and mapping between the source and target language, not a plain reconstruction.