Reflections and Lessons from Hand-Writing an RNN from Scratch



Tips

I'm writing this article so that the pitfalls I hit with RNNs don't trip me up again. The structural ideas are not hard to understand; it is the all-important code implementation that goes wrong most easily, and anyone who has dug through a paper and then sat down to implement it has probably been through the same thing. (Aside: I'm a slow learner and not the best communicator, so please do point out any problems with this article.)

Core ideas of the RNN

The figure here is borrowed from Mu Li (沐神), a.k.a. "the godfather"... (the nicknames only get more over the top from there) to illustrate the idea.
Reference: https://zh-v2.d2l.ai/chapter_recurrent-neural-networks/rnn.html
[Figure: RNN schematic]
It boils down to two very simple formulas:

$$
H_t = W_{xh} \times X_{t-1} + W_{hh} \times H_{t-1} + b_h \\
X_{predict} = W_{q} \times H_t + b_q
$$

There is a pitfall here when turning this into code: after the input $X_{t-1}$ has been used to produce the prediction $X_{predict}$, the resulting $H_t$ (as the diagram shows) still has to take part in the next vertical prediction step for $X_t$, as well as in generating $H_{t+1}$.
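To make this concrete, the recurrence is just a loop that keeps reusing $H$: the state produced at one step is fed into the next. A minimal sketch (the sizes and variable names here are illustrative only, not the exact setup used below):

import torch

vocab_size, num_hidden = 7, 32                     # illustrative sizes
W_xh = torch.randn(vocab_size, num_hidden) * 0.01  # input-to-hidden weights
W_hh = torch.randn(num_hidden, num_hidden) * 0.01  # hidden-to-hidden weights
b_h = torch.zeros(num_hidden)
W_q = torch.randn(num_hidden, vocab_size) * 0.01   # hidden-to-output weights
b_q = torch.zeros(vocab_size)

H = torch.zeros(1, num_hidden)                     # H_0
xs = torch.eye(vocab_size)[:3]                     # three one-hot inputs, e.g. X_0, X_1, X_2
for x_t in xs:                                     # the H produced here feeds the next step
    H = torch.tanh(x_t.reshape(1, -1) @ W_xh + H @ W_hh + b_h)
    x_predict = H @ W_q + b_q                      # prediction for the next token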

RNN experiment code for a single sentence

Based on the principle above, we can put together a first RNN implementation for the case where there is only a single sentence.

import torch
from torch import nn
import re
import collections
from torch.nn import functional as F

sentence = "this is a difficult problem,is not simple"


def get_words_from_sentence(text):
    # keep letters only, then split into word tokens
    return re.sub("[^a-zA-Z]", " ", text).split()


def get_token_to_idx(words):
    # map each unique token to an index, in order of first appearance
    words_corpus = collections.Counter(words)
    return {token: idx for idx, (token, _) in enumerate(words_corpus.items())}


def get_idx_to_token(words):
    return [k for k in get_token_to_idx(words)]


def sgd(params, lr=0.01):
    # plain gradient-descent update, then reset the accumulated gradients
    with torch.no_grad():
        for param in params:
            slope = param.grad
            param -= slope * lr
            param.grad.zero_()


def grad_clipping(params, theta):
    """Clip gradients so that their global L2 norm does not exceed theta."""
    norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
    if norm > theta:
        for param in params:
            param.grad[:] *= theta / norm


device = torch.device("cuda:0")

words = get_words_from_sentence(sentence)

token_to_idx = get_token_to_idx(words)
idx_to_token = get_idx_to_token(words)
sentence_len = len(words)

sentence_to_idx = [token_to_idx.get(item) for item in words]

x = torch.tensor(sentence_to_idx[:-1], device=device)  # inputs: every token except the last
y = torch.tensor(sentence_to_idx[1:], device=device)   # targets: the next token at each position

x = F.one_hot(x, len(idx_to_token)).T
y = F.one_hot(y, len(idx_to_token)).T

W_xh = torch.normal(0.1, 0.1, (sentence_len - 1, 32), device=device, requires_grad=True)
W_hh = torch.normal(0.1, 0.1, (32, 32), device=device, requires_grad=True)
b_h = torch.zeros((32,), device=device, requires_grad=True)
W_xq = torch.normal(0.1, 0.1, (32, sentence_len - 1), device=device, requires_grad=True)
b_q = torch.zeros((sentence_len - 1,), device=device, requires_grad=True)

H = torch.ones((1, 32), device=device) * 0.1
criterion = nn.CrossEntropyLoss()

epochs = 20000
for epoch in range(epochs):
    # forward pass; H is detached so gradients do not flow back through earlier iterations
    H = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(H.detach(), W_hh) + b_h)
    y_hat = torch.mm(H, W_xq) + b_q

    loss = criterion(y_hat.float(), y.float())
    if (epoch + 1) % 100 == 0:
        print(f"epoch {epoch + 1} / {epochs}: ---> loss:{loss:4f}")
        print("--------------------------------------------------")
    loss.backward()

    grad_clipping(params=[W_xh, W_hh, b_h, W_xq, b_q], theta=loss)

    with torch.no_grad():
        sgd([W_xh, W_hh, b_h, W_xq, b_q])


def rnn(x, state):
    state = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(state.detach(), W_hh) + b_h)
    y = torch.mm(state, W_xq) + b_q
    return y, state


prefix = torch.tensor([0, 1, 2], device=device)
state = torch.ones((1, 32), device=device) * 0.1
prefix = F.one_hot(prefix, 7)
prefix = prefix.reshape(prefix.shape[0], 1, prefix.shape[1])
num_pre = 3
outputs = [prefix[0]]
for y in prefix[1:]:
    # warm-up: run the known prefix through the network to build up the state
    _, state = rnn(outputs[-1], state)
    outputs.append(y)
for _ in range(num_pre):
    # generation: feed the previous output back in as the next input
    y, state = rnn(outputs[-1], state)
    outputs.append(y)

print([idx_to_token[torch.argmax(i)] for i in outputs])

Mu Li's code really is a pleasure to read, and I borrowed a lot from it, but my code above still contains plenty of mistakes worth reflecting on:

  • The one-hot inputs should really be replaced with embedding vectors; as the sentence gets longer, the loss alone makes the problem obvious (see the embedding sketch after this list).
  • The code has no masking yet; once masks are added, one-to-many, many-to-many, many-to-one and other setups can be configured flexibly.
  • With only a single sentence, I did not notice at the time how inefficient this design actually is.
  • Computed this way, the hidden state never really propagates forward through time; it is simply treated as one matrix and multiplied in a single shot.
  • In practice, gradient descent on this model behaved poorly.
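For the first point in the list above, a minimal sketch of what replacing the one-hot inputs with a learned embedding could look like; nn.Embedding is the standard PyTorch module for this, and the sizes and token indices below are made up for illustration:

import torch
from torch import nn

vocab_size, embed_dim, num_hidden = 7, 16, 32     # illustrative sizes
embedding = nn.Embedding(vocab_size, embed_dim)   # learned dense vectors instead of one-hot
W_xh = torch.randn(embed_dim, num_hidden) * 0.01
W_hh = torch.randn(num_hidden, num_hidden) * 0.01
b_h = torch.zeros(num_hidden)

token_ids = torch.tensor([0, 1, 2])               # hypothetical token indices
x = embedding(token_ids)                          # (3, embed_dim): dense inputs for 3 steps
h = torch.zeros(1, num_hidden)
for x_t in x:                                     # recurrence over the three time steps
    h = torch.tanh(x_t.reshape(1, -1) @ W_xh + h @ W_hh + b_h)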

RNN experiment code for multiple sentences

With the lessons above in mind, the multi-sentence experiment turns out noticeably different.

import torch
from processing import read_tokens, Vocab
from torch import nn


def get_params(vocab_size, num_hidden, device):
    input_size = output_size = vocab_size

    def normal(in_channel, out_channel):
        return torch.randn((in_channel, out_channel), device=device, requires_grad=True)

    w_xh = normal(input_size, num_hidden)
    w_hh = normal(num_hidden, num_hidden)
    b_h = torch.zeros((num_hidden,), device=device, requires_grad=True)

    w_xq = normal(output_size, num_hidden)
    b_q = torch.zeros((output_size,), device=device, requires_grad=True)

    return [w_xh, w_hh, b_h, w_xq, b_q]


def init_state(batch_size, num_hidden, device):
    # initial hidden state: one zero row per sequence in the batch
    return torch.zeros((batch_size, num_hidden), device=device)


def rnn(inputs, params, state):
    w_xh, w_hh, b_h, w_xq, b_q = params
    outputs = []
    for x in inputs:  # inputs is time-major, so this loop walks over time steps
        state = torch.tanh(torch.mm(x, w_xh) + torch.mm(state.detach(), w_hh) + b_h)
        y_hat = torch.mm(state, w_xq.T) + b_q
        outputs.append(y_hat.reshape(1, y_hat.shape[0], y_hat.shape[1]))
    # stack along the time dimension, then transpose back to batch-major
    return torch.cat(outputs, dim=0).transpose(1, 0), state


def one_hot(inputs, dict_size, num_steps, batch_size, device):
    features = torch.zeros((batch_size, num_steps, dict_size))
    for i in range(batch_size):
        for j in range(num_steps):
            features[i, j, inputs[i][j]] = 1
    return features


class RNNScratch:
    def __init__(self, vocab_size, num_hidden, forward_fn, init_state, device):
        self.params = get_params(vocab_size, num_hidden, device)
        self.num_hidden = num_hidden
        self.vocab_size = vocab_size
        self.device = device
        self.forward_fn = forward_fn
        self.init_state = init_state

    def __call__(self, x):
        batch_size = x.shape[0]
        state = self.init_state(batch_size, self.num_hidden, self.device)
        # transpose to time-major (num_steps, batch_size, vocab_size) before the recurrence
        return self.forward_fn(x.transpose(1, 0), self.params, state)


def sgd(params, lr=0.05):
    with torch.no_grad():
        for param in params:
            param -= param.grad * lr
            param.grad.zero_()


def train(epochs, model, inputs, outputs, criterion, optimizer, device):
    for epoch in range(epochs):
        y_hat, _ = model(inputs.to(device))
        # flatten to (batch * num_steps, vocab_size) so the vocabulary is the class dimension
        loss = criterion(y_hat.reshape(-1, y_hat.shape[-1]), outputs.to(device).reshape(-1, outputs.shape[-1]))
        loss.backward()
        optimizer(model.params)
        if (epoch + 1) % 100 == 0:
            print(f"{epoch + 1} / {epochs}  ----   loss:{loss.item():.4f}")


def mask_processing(tokens, num_steps):
    # inputs: each sequence minus its last num_steps tokens; targets: the same sequence shifted by num_steps
    return tokens[:, :-num_steps], tokens[:, num_steps:]


def predict(characters, vocab, model, device):
    inputs = torch.tensor([[vocab[character] for character in characters]])
    inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)

    outputs, _ = model(inputs.to(device))
    outputs = outputs.reshape(-1, len(vocab))
    result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
    return result


def sample(model, vocab, out_len, text, device):
    characters = text.strip().lower().split()
    size = out_len - len(characters)
    for _ in range(size):
        character = predict(characters, vocab, model, device=device)
        characters.append(character[-1])
    print(" ".join(characters))


if __name__ == '__main__':
    device = torch.device("cuda:0")

    tokens = read_tokens()
    vocab = Vocab(tokens)
    vocab_size = len(vocab)
    tokens_len = len(tokens)
    tokens = torch.tensor([vocab[token] for token in tokens])

    num_steps = 1
    inputs, outputs = mask_processing(tokens, num_steps)
    inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
    outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)

    num_hidden = 12
    model = RNNScratch(vocab_size, num_hidden, rnn, init_state, device=device)

    criterion = nn.CrossEntropyLoss()
    train(1000, model, inputs, outputs, criterion, sgd, device=device)
    # predict(["do", "you"], vocab, model, device=device)
    sample(model, vocab, 10, "how do you", device)


As said at the start, the hidden state has to keep being passed along. Just as I was wondering why not transpose X and compute along the time dimension, I found the same idea confirmed in Mu Li's code (humbling; the code above has already been updated to the time-major transposition):
[screenshot of the d2l RNN code]
In my code above I likewise made the sequence (time) dimension come first, and it worked remarkably well:
[screenshot]
But beginners beware: changing dimensions like this really does require a grounding in linear algebra, otherwise things go wrong; that is also why the result inside the rnn output function is transposed back again (effectively restoring the original layout):
[screenshot]
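A shape-only sketch of that round trip, with illustrative sizes; it mirrors the transpose(1, 0) in RNNScratch.__call__ and the one at the end of rnn:

import torch

batch_size, num_steps, vocab_size, num_hidden = 4, 5, 7, 12     # illustrative sizes
x = torch.zeros(batch_size, num_steps, vocab_size)              # batch-major, as one_hot produces it
x_time_major = x.transpose(1, 0)                                # (num_steps, batch_size, vocab_size)
# ...the recurrence walks over the first (time) dimension here...
y_time_major = torch.zeros(num_steps, batch_size, num_hidden)   # stand-in for the stacked outputs
y = y_time_major.transpose(1, 0)                                # back to (batch_size, num_steps, num_hidden)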
With that, the code implements the essentials of the RNN. What remains is adding a batch_size: in real development you cannot afford to feed an enormous corpus into the model all at once, the cost is simply too high.
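Purely as a sketch of that remaining step, here is one common way to cut a long sequence of token ids into (batch_size, num_steps) minibatches; the function name and the sequential sampling scheme are my own choice for illustration, not taken from the book:

import torch

def batch_iter(token_ids, batch_size, num_steps):
    # yield (x, y) minibatches of shape (batch_size, num_steps) from a 1-D tensor of token ids
    num_tokens = ((len(token_ids) - 1) // (batch_size * num_steps)) * batch_size * num_steps
    xs = token_ids[:num_tokens].reshape(batch_size, -1)       # inputs
    ys = token_ids[1:num_tokens + 1].reshape(batch_size, -1)  # targets shifted by one token
    for i in range(0, xs.shape[1], num_steps):
        yield xs[:, i:i + num_steps], ys[:, i:i + num_steps]

# hypothetical usage with a long corpus of token indices
corpus = torch.arange(1000)
for x, y in batch_iter(corpus, batch_size=8, num_steps=10):
    pass  # x and y each have shape (8, 10)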

Implementation with nn.RNN

import torch
from processing import read_tokens, Vocab
from torch import nn


def init_state(n_layers, batch_size, num_hidden, device):
    # initial hidden state for nn.RNN: (num_layers, batch_size, num_hidden)
    return torch.zeros((n_layers, batch_size, num_hidden), device=device)


def one_hot(inputs, dict_size, num_steps, batch_size, device):
    features = torch.zeros((batch_size, num_steps, dict_size))
    for i in range(batch_size):
        for j in range(num_steps):
            features[i, j, inputs[i][j]] = 1
    return features


class RNNModel(nn.Module):
    def __init__(self, vocab_size, num_hidden, n_layers, device, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.num_hidden = num_hidden
        self.vocab_size = vocab_size
        self.n_layers = n_layers
        self.device = device
        # batch_first=True keeps the inputs batch-major: (batch_size, num_steps, vocab_size)
        self.rnn = nn.RNN(vocab_size, num_hidden, num_layers=self.n_layers, batch_first=True, device=device)
        self.fc = nn.Linear(num_hidden, vocab_size, device=device)

    def forward(self, x):
        batch_size = x.shape[0]
        state = init_state(self.n_layers, batch_size, self.num_hidden, self.device)
        y, state = self.rnn(x, state)
        return self.fc(y), state


def train(epochs, model, inputs, outputs, criterion, optimizer, device):
    for epoch in range(epochs):
        optimizer.zero_grad()
        y_hat, _ = model(inputs.to(device))
        # flatten to (batch * num_steps, vocab_size) so the vocabulary is the class dimension
        loss = criterion(y_hat.reshape(-1, y_hat.shape[-1]), outputs.to(device).reshape(-1, outputs.shape[-1]))
        loss.backward()
        optimizer.step()
        if (epoch + 1) % 100 == 0:
            print(f"{epoch + 1} / {epochs}  ----   loss:{loss.item():.4f}")


def mask_processing(tokens, num_steps):
    return tokens[:, :-num_steps], tokens[:, num_steps:]


def predict(characters, vocab, model, device):
    inputs = torch.tensor([[vocab[character] for character in characters]])
    inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)

    outputs, _ = model(inputs.to(device))
    outputs = outputs.reshape(-1, len(vocab))
    result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
    return result


def sample(model, vocab, out_len, text, device):
    characters = text.strip().lower().split()
    size = out_len - len(characters)
    for _ in range(size):
        character = predict(characters, vocab, model, device=device)
        characters.append(character[-1])
    print(" ".join(characters))


if __name__ == '__main__':
    device = torch.device("cuda:0")

    tokens = read_tokens()
    vocab = Vocab(tokens)
    vocab_size = len(vocab)
    tokens_len = len(tokens)
    tokens = torch.tensor([vocab[token] for token in tokens])

    num_steps = 1
    inputs, outputs = mask_processing(tokens, num_steps)
    inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
    outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)

    num_hidden = 12
    model = RNNModel(vocab_size, num_hidden, n_layers=1, device=device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    train(1000, model, inputs, outputs, criterion, optimizer, device=device)
    # predict(["do", "you"], vocab, model, device=device)
    sample(model, vocab, 10, "how do you", device)

The data preprocessing used above is explained in detail in Dive into Deep Learning (《动手学深度学习》).
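One detail worth double-checking when switching to nn.RNN: with the default batch_first=False it expects time-major input of shape (seq_len, batch, input_size), while batch_first=True accepts batch-major (batch, seq_len, input_size); the initial hidden state is (num_layers, batch, num_hidden) in either case. A quick shape check with illustrative sizes:

import torch
from torch import nn

seq_len, batch_size, vocab_size, num_hidden = 5, 4, 7, 12   # illustrative sizes
rnn = nn.RNN(vocab_size, num_hidden, batch_first=True)      # batch-major convention
x = torch.zeros(batch_size, seq_len, vocab_size)            # (batch, time, features)
h0 = torch.zeros(1, batch_size, num_hidden)                 # (num_layers, batch, hidden)
y, hn = rnn(x, h0)
print(y.shape)   # torch.Size([4, 5, 12]) -> hidden state at every time step
print(hn.shape)  # torch.Size([1, 4, 12]) -> final hidden state of each layer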
