I'm writing this article so that the pitfalls I hit while implementing RNNs don't trip me up a second time. The structural ideas are not hard to understand; it is the implementation, the part that matters most, where things tend to go wrong, as anyone who has dug through a paper and then tried to code it up will recognize. (Side note: I'm a slow learner and not great at explaining things, so please point out any problems with this article.)
Core ideas of RNN
The diagrams used here are borrowed from Mu Li (a.k.a. 沐神, "the godfather"… the nicknames only get more outlandish from there).
Reference: https://zh-v2.d2l.ai/chapter_recurrent-neural-networks/rnn.html
The model boils down to two very simple formulas:
$H_t = W_{xh} \times X_{t-1} + W_{hh} \times H_{t-1} + b_h$
$X_{predict} = W_{q} \times H_t + b_q$
There is a pitfall here when writing the code: if the input is $X_{t-1}$ and we obtain the prediction $X_{predict}$, the hidden state $H_t$ we computed must, as the diagram shows, also take part in the next "vertical" prediction of $X_t$, as well as in generating $H_{t+1}$.
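As a minimal illustration of that recurrence (a sketch with hypothetical sizes, plain PyTorch, and the tanh activation the code below also uses), note that the H produced at one step is exactly what gets fed back in at the next step:

import torch

vocab_size, num_hidden = 7, 32                   # hypothetical sizes, for illustration only
W_xh = torch.randn(vocab_size, num_hidden)
W_hh = torch.randn(num_hidden, num_hidden)
b_h = torch.zeros(num_hidden)
W_q = torch.randn(num_hidden, vocab_size)
b_q = torch.zeros(vocab_size)

H = torch.zeros(1, num_hidden)                   # H_{t-1}: the carried hidden state
for X in torch.eye(vocab_size):                  # one one-hot token per time step
    X = X.reshape(1, -1)                         # X_{t-1}
    H = torch.tanh(X @ W_xh + H @ W_hh + b_h)    # H_t depends on the previous H
    X_predict = H @ W_q + b_q                    # the prediction for the next token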
RNN experiment code for a single sentence
Based on the principles above, we can draft a first RNN implementation for the single-sentence case.
import torch
from torch import nn
import re
import collections
from torch.nn import functional as F
sentence = "this is a difficult problem,is not simple"
def get_words_from_sentence(text):
    # keep letters only, then split on whitespace
    return re.sub("[^a-zA-Z]", " ", text).split()
def get_token_to_idx(words):
words_corpus = collections.Counter(words)
return {token: idx for idx, (token, _) in enumerate(words_corpus.items())}
def get_idx_to_token(words):
return [k for k in get_token_to_idx(words)]
def sgd(params, lr=0.01):
with torch.no_grad():
for param in params:
slope = param.grad
param -= slope * lr
param.grad.zero_()
def grad_clipping(params, theta):
"""裁剪梯度"""
norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
if norm > theta:
for param in params:
param.grad[:] *= theta / norm
device = torch.device("cuda:0")
words = get_words_from_sentence(sentence)
token_to_idx = get_token_to_idx(words)
idx_to_token = get_idx_to_token(words)
sentence_len = len(words)
sentence_to_idx = [token_to_idx.get(item) for item in words]
x = torch.tensor(sentence_to_idx[:-1], device=device)
y = torch.tensor(sentence_to_idx[1:], device=device)
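# next-token prediction: x is the sequence without its last token, y is the same sequence shifted by one;
# the one_hot(...).T calls below turn both into one-hot matrices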
x = F.one_hot(x, len(idx_to_token)).T
y = F.one_hot(y, len(idx_to_token)).T
W_xh = torch.normal(0.1, 0.1, (sentence_len - 1, 32), device=device, requires_grad=True)
W_hh = torch.normal(0.1, 0.1, (32, 32), device=device, requires_grad=True)
b_h = torch.zeros((32,), device=device, requires_grad=True)
W_xq = torch.normal(0.1, 0.1, (32, sentence_len - 1), device=device, requires_grad=True)
b_q = torch.zeros((sentence_len - 1,), device=device, requires_grad=True)
H = torch.ones((1, 32), device=device) * 0.1
criterion = nn.CrossEntropyLoss()
epochs = 20000
for epoch in range(epochs):
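    # forward pass over the whole one-hot sequence at once; H.detach() stops gradients from flowing back into the previous epoch's state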
H = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(H.detach(), W_hh) + b_h)
y_hat = torch.mm(H, W_xq) + b_q
loss = criterion(y_hat.float(), y.float())
if (epoch + 1) % 100 == 0:
print(f"epoch {epoch + 1} / {epochs}: ---> loss:{loss:4f}")
print("--------------------------------------------------")
loss.backward()
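    # note: d2l clips against a fixed threshold (e.g. theta=1); here the current loss value is used as the threshold, which is unusual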
grad_clipping(params=[W_xh, W_hh, b_h, W_xq, b_q], theta=loss)
with torch.no_grad():
sgd([W_xh, W_hh, b_h, W_xq, b_q])
def rnn(x, state):
state = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(state.detach(), W_hh) + b_h)
y = torch.mm(state, W_xq) + b_q
return y, state
prefix = torch.tensor([0, 1, 2], device=device)
state = torch.ones((1, 32), device=device) * 0.1
prefix = F.one_hot(prefix, 7)
prefix = prefix.reshape(prefix.shape[0], 1, prefix.shape[1])
num_pre = 3
outputs = [prefix[0]]
for y in prefix[1:]:
_, state = rnn(outputs[-1], state)
outputs.append(y)
for _ in range(num_pre):
y, state = rnn(outputs[-1], state)
outputs.append(y)
print([idx_to_token[torch.argmax(i)] for i in outputs])
Mu Li's code really is a pleasure to read and I borrowed a lot from it, but my own code above still has plenty of mistakes and points worth reflecting on:
- one_hot should really be replaced with word-embedding vectors; as the sentence grows longer, the loss alone makes the problem obvious (see the sketch after this list).
- The code does not add any masking yet; once masking is added, one-to-many, many-to-many, and many-to-one setups can all be configured flexibly.
- Because there is only one sentence, I did not notice at the time how inefficient this design actually is.
- With this kind of computation the hidden state never really propagates horizontally through time; hidden is treated as one matrix and multiplied in a single shot.
- In actual testing the gradient-descent behaviour is worrying.
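A minimal sketch of the first point, with nn.Embedding in place of the one-hot input (sizes are hypothetical; everything else mirrors the recurrence in the code above):

import torch
from torch import nn

vocab_size, embed_dim, num_hidden = 7, 16, 32    # hypothetical sizes
embedding = nn.Embedding(vocab_size, embed_dim)

W_xh = torch.randn(embed_dim, num_hidden, requires_grad=True)
W_hh = torch.randn(num_hidden, num_hidden, requires_grad=True)
b_h = torch.zeros(num_hidden, requires_grad=True)

token_ids = torch.tensor([0, 1, 2, 3])           # token indices instead of one-hot rows
H = torch.zeros(1, num_hidden)
for idx in token_ids:
    x = embedding(idx).reshape(1, -1)            # dense (1, embed_dim) vector, learned jointly with the model
    H = torch.tanh(x @ W_xh + H @ W_hh + b_h)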
RNN experiment code for multiple sentences
With the lessons above, the multi-sentence experiment looks noticeably different.
import torch
from processing import read_tokens, Vocab
from torch import nn
def get_params(vocab_size, num_hidden, device):
input_size = output_size = vocab_size
def normal(in_channel, out_channel):
return torch.randn((in_channel, out_channel), device=device, requires_grad=True)
w_xh = normal(input_size, num_hidden)
w_hh = normal(num_hidden, num_hidden)
b_h = torch.zeros((num_hidden,), device=device, requires_grad=True)
w_xq = normal(output_size, num_hidden)
b_q = torch.zeros((output_size,), device=device, requires_grad=True)
return [w_xh, w_hh, b_h, w_xq, b_q]
def init_state(batch_size, num_hidden, device):
    # one state row per sequence in the batch, all zeros at the start
    return torch.zeros((batch_size, num_hidden), device=device)
def rnn(inputs, params, state):
w_xh, w_hh, b_h, w_xq, b_q = params
outputs = []
for x in inputs:
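        # x is one time step of shape (batch_size, vocab_size); the same state is threaded through every step, and detach() truncates backpropagation through time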
state = torch.tanh(torch.mm(x, w_xh) + torch.mm(state.detach(), w_hh) + b_h)
y_hat = torch.mm(state, w_xq.T) + b_q
outputs.append(y_hat.reshape(1, y_hat.shape[0], y_hat.shape[1]))
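    # the stacked outputs are time-major (num_steps, batch, vocab); transpose back to batch-first before returning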
return torch.cat(outputs, dim=0).transpose(1, 0), state
def one_hot(inputs, dict_size, num_steps, batch_size, device):
features = torch.zeros((batch_size, num_steps, dict_size))
for i in range(batch_size):
for j in range(num_steps):
features[i, j, inputs[i][j]] = 1
return features
class RNNScratch:
def __init__(self, vocab_size, num_hidden, forward_fn, init_state, device):
self.params = get_params(vocab_size, num_hidden, device)
self.num_hidden = num_hidden
self.vocab_size = vocab_size
self.device = device
self.forward_fn = forward_fn
self.init_state = init_state
def __call__(self, x):
batch_size = x.shape[0]
        state = self.init_state(batch_size, self.num_hidden, self.device)
return self.forward_fn(x.transpose(1, 0), self.params, state)
def sgd(params, lr=0.05):
with torch.no_grad():
for param in params:
param -= param.grad * lr
param.grad.zero_()
def train(epochs, model, inputs, outputs, criterion, optimizer, device):
for epoch in range(epochs):
y_hat, _ = model(inputs.to(device))
loss = criterion(y_hat, outputs.to(device))
loss.backward()
optimizer(model.params)
if (epoch + 1) % 100 == 0:
print(f"{epoch + 1} / {epochs} ---- loss:{loss.item():.4f}")
def mask_processing(tokens, num_steps):
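    # a simple shift rather than a true mask: inputs drop the last num_steps tokens, targets drop the first num_steps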
return tokens[:, :-num_steps], tokens[:, num_steps:]
def predict(characters, vocab, model, device):
inputs = torch.tensor([[vocab[character] for character in characters]])
inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)
outputs, _ = model(inputs.to(device))
outputs = outputs.reshape(-1, len(vocab))
result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
return result
def sample(model, vocab, out_len, prefix, device):
    # keep generating until the output reaches out_len words
    characters = prefix.strip().lower().split()
size = out_len - len(characters)
for _ in range(size):
character = predict(characters, vocab, model, device=device)
characters.append(character[-1])
print(" ".join(characters))
if __name__ == '__main__':
device = torch.device("cuda:0")
tokens = read_tokens()
vocab = Vocab(tokens)
vocab_size = len(vocab)
tokens_len = len(tokens)
tokens = torch.tensor([vocab[token] for token in tokens])
num_steps = 1
inputs, outputs = mask_processing(tokens, num_steps)
inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)
num_hidden = 12
model = RNNScratch(vocab_size, num_hidden, rnn, init_state, device=device)
criterion = nn.CrossEntropyLoss()
train(1000, model, inputs, outputs, criterion, sgd, device=device)
# predict(["do", "you"], vocab, model, device=device)
sample(model, vocab, 10, "how do you", device)
As stated at the beginning, the hidden state has to be passed along continuously. Just as I was wondering why not transpose X and compute along the time axis, Mu Li's code confirmed exactly that idea (humbling; I have since revised the code above to use the time-major transpose).
In the code above I therefore made the sequence (time) dimension the first axis, with remarkably good results.
Beginners should take note, though: rearranging dimensions requires a solid grasp of linear algebra, or things will go wrong. That is also why the result inside the rnn forward function is transposed back at the end, restoring the original layout.
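Concretely, the shape round trip looks like this (the sizes are arbitrary, purely for illustration):

import torch

x = torch.zeros(4, 10, 7)            # (batch_size, num_steps, vocab_size), batch-first
x_time_major = x.transpose(1, 0)     # (num_steps, batch_size, vocab_size): loop over time steps
assert x_time_major.transpose(1, 0).shape == x.shape   # transposing again restores the original layout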
At this point the code essentially implements the RNN mechanism. What remains is adding batch_size: in real development you cannot feed an enormous text into the model in one go, the cost is far too high.
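A sketch of that remaining step: get_batches below is my own illustrative name, not part of the code above, and it assumes the corpus has already been turned into one long sequence of token indices. Each yielded pair is a (batch_size, num_steps) slice of inputs together with the targets shifted by one token:

import torch

def get_batches(corpus_idx, batch_size, num_steps):
    # corpus_idx: 1-D tensor of token indices for the whole corpus
    num_tokens = ((len(corpus_idx) - 1) // (batch_size * num_steps)) * batch_size * num_steps
    xs = corpus_idx[:num_tokens].reshape(batch_size, -1)
    ys = corpus_idx[1:num_tokens + 1].reshape(batch_size, -1)
    for i in range(0, xs.shape[1], num_steps):
        yield xs[:, i:i + num_steps], ys[:, i:i + num_steps]

corpus_idx = torch.randint(0, 7, (1000,))        # stand-in for the indexed corpus
for x_batch, y_batch in get_batches(corpus_idx, batch_size=32, num_steps=8):
    pass                                         # one-hot / embed x_batch, then run one training step per batch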
Implementation with nn.RNN
import torch
from processing import read_tokens, Vocab
from torch import nn
def get_params(vocab_size, num_hidden, device):
input_size = output_size = vocab_size
def normal(in_channel, out_channel):
return torch.randn((in_channel, out_channel), device=device, requires_grad=True)
w_xh = normal(input_size, num_hidden)
w_hh = normal(num_hidden, num_hidden)
b_h = torch.zeros((num_hidden,), device=device, requires_grad=True)
w_xq = normal(output_size, num_hidden)
b_q = torch.zeros((output_size,), device=device, requires_grad=True)
return [w_xh, w_hh, b_h, w_xq, b_q]
def init_state(num_layers, batch_size, num_hidden, device):
    # nn.RNN expects the initial hidden state as (num_layers, batch_size, num_hidden)
    return torch.zeros((num_layers, batch_size, num_hidden), device=device)
def one_hot(inputs, dict_size, num_steps, batch_size, device):
features = torch.zeros((batch_size, num_steps, dict_size))
for i in range(batch_size):
for j in range(num_steps):
features[i, j, inputs[i][j]] = 1
return features
class RNNModel(nn.Module):
def __init__(self, vocab_size, num_hidden, n_layers, device, *args, **kwargs):
super().__init__(*args, **kwargs)
self.num_hidden = num_hidden
self.vocab_size = vocab_size
self.n_layers = n_layers
self.device = device
self.rnn = nn.RNN(vocab_size, num_hidden, num_layers=self.n_layers, device=device)
self.fc = nn.Linear(num_hidden, vocab_size, device=device)
def forward(self, x):
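        # with the default batch_first=False, nn.RNN reads x as (seq_len, batch, input_size), so x.shape[1] is the batch dimension it sees; the initial state must be (num_layers, batch, num_hidden)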
batch_size = x.shape[1]
state = init_state(self.n_layers, batch_size, self.num_hidden, self.device)
y, state = self.rnn(x, state)
return self.fc(y), state
def train(epochs, model, inputs, outputs, criterion, optimizer, device):
for epoch in range(epochs):
optimizer.zero_grad()
y_hat, _ = model(inputs.to(device))
loss = criterion(y_hat, outputs.to(device))
loss.backward()
optimizer.step()
if (epoch + 1) % 100 == 0:
print(f"{epoch + 1} / {epochs} ---- loss:{loss.item():.4f}")
def mask_processing(tokens, num_steps):
return tokens[:, :-num_steps], tokens[:, num_steps:]
def predict(characters, vocab, model, device):
inputs = torch.tensor([[vocab[character] for character in characters]])
inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)
outputs, _ = model(inputs.to(device))
outputs = outputs.reshape(-1, len(vocab))
result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
return result
def sample(model, vocab, out_len, prefix, device):
    # keep generating until the output reaches out_len words
    characters = prefix.strip().lower().split()
size = out_len - len(characters)
for _ in range(size):
character = predict(characters, vocab, model, device=device)
characters.append(character[-1])
print(" ".join(characters))
if __name__ == '__main__':
device = torch.device("cuda:0")
tokens = read_tokens()
vocab = Vocab(tokens)
vocab_size = len(vocab)
tokens_len = len(tokens)
tokens = torch.tensor([vocab[token] for token in tokens])
num_steps = 1
inputs, outputs = mask_processing(tokens, num_steps)
inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)
num_hidden = 12
model = RNNModel(vocab_size, num_hidden, n_layers=1, device=device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train(1000, model, inputs, outputs, criterion, optimizer, device=device)
# predict(["do", "you"], vocab, model, device=device)
sample(model, vocab, 10, "how do you", device)
The data preprocessing used above (read_tokens, Vocab) is explained in detail in Dive into Deep Learning (《动手学深度学习》).
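The processing module itself is not included in this post. Purely as an assumption, a minimal version that is compatible with the calls made above (read_tokens returning equal-length lists of word tokens so they can be stacked into a 2-D tensor, and a Vocab in the style of D2L's) might look like the following; the real module may well differ:

# processing.py -- hypothetical sketch, not the author's actual module
import collections
import re

def read_tokens(path="corpus.txt", num_steps=8):
    # assumed behaviour: read a plain-text file, keep letters only,
    # then chop the word list into equal-length windows
    with open(path) as f:
        text = re.sub("[^a-zA-Z]+", " ", f.read()).lower()
    words = text.split()
    return [words[i:i + num_steps]
            for i in range(0, len(words) - num_steps + 1, num_steps)]

class Vocab:
    """Token <-> index mapping, loosely following the D2L Vocab class."""
    def __init__(self, tokens):
        flat = [tok for line in tokens for tok in line]
        counter = collections.Counter(flat)
        self.idx_to_token = ["<unk>"] + [tok for tok, _ in counter.most_common()]
        self.token_to_idx = {tok: idx for idx, tok in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(tok) for tok in tokens]
        return self.token_to_idx.get(tokens, 0)

    def to_tokens(self, index):
        return self.idx_to_token[index]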