I'm writing this article so that the pitfalls I hit while implementing RNNs don't trip me up a second time. The structural ideas are not hard to understand; it is the implementation, the part that matters most, where things tend to go wrong, as anyone who has dug through a paper and then tried to code it up will recognize. (Side note: I'm a slow learner and not great at explaining things, so please point out any problems with this article.)
Core ideas of RNN
The diagrams used here are borrowed from Mu Li (a.k.a. 沐神, "the godfather"… the nicknames only get more outlandish from there).
Reference: https://zh-v2.d2l.ai/chapter_recurrent-neural-networks/rnn.html
The model boils down to two very simple formulas:
$H_t = W_{xh} \times X_{t-1} + W_{hh} \times H_{t-1} + b_h$
$X_{predict} = W_{q} \times H_t + b_q$
There is a pitfall here when writing the code: if the input is $X_{t-1}$ and we obtain the prediction $X_{predict}$, the hidden state $H_t$ we computed must, as the diagram shows, also take part in the next "vertical" prediction of $X_t$, as well as in generating $H_{t+1}$.
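As a minimal illustration of that recurrence (a sketch with hypothetical sizes, plain PyTorch, and the tanh activation the code below also uses), note that the H produced at one step is exactly what gets fed back in at the next step:

import torch

vocab_size, num_hidden = 7, 32                   # hypothetical sizes, for illustration only
W_xh = torch.randn(vocab_size, num_hidden)
W_hh = torch.randn(num_hidden, num_hidden)
b_h = torch.zeros(num_hidden)
W_q = torch.randn(num_hidden, vocab_size)
b_q = torch.zeros(vocab_size)

H = torch.zeros(1, num_hidden)                   # H_{t-1}: the carried hidden state
for X in torch.eye(vocab_size):                  # one one-hot token per time step
    X = X.reshape(1, -1)                         # X_{t-1}
    H = torch.tanh(X @ W_xh + H @ W_hh + b_h)    # H_t depends on the previous H
    X_predict = H @ W_q + b_q                    # the prediction for the next token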
RNN experiment code for a single sentence
Based on the principles above, we can draft a first RNN implementation for the single-sentence case.
import torch
from torch import nn
import re
import collections
from torch.nn import functional as F
sentence = "this is a difficult problem,is not simple"
def get_words_from_sentence(text):
    # keep letters only, then split on whitespace
    return re.sub("[^a-zA-Z]", " ", text).split()
def get_token_to_idx(words):
words_corpus = collections.Counter(words)
return {token: idx for idx, (token, _) in enumerate(words_corpus.items())}
def get_idx_to_token(words):
return [k for k in get_token_to_idx(words)]
def sgd(params, lr=0.01):
with torch.no_grad():
for param in params:
slope = param.grad
param -= slope * lr
param.grad.zero_()
def grad_clipping(params, theta):
"""裁剪梯度"""
norm = torch.sqrt(sum(torch.sum((p.grad ** 2)) for p in params))
if norm > theta:
for param in params:
param.grad[:] *= theta / norm
device = torch.device("cuda:0")
words = get_words_from_sentence(sentence)
token_to_idx = get_token_to_idx(words)
idx_to_token = get_idx_to_token(words)
sentence_len = len(words)
sentence_to_idx = [token_to_idx.get(item) for item in words]
x = torch.tensor(sentence_to_idx[:-1], device=device)
y = torch.tensor(sentence_to_idx[1:], device=device)
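# next-token prediction: x is the sequence without its last token, y is the same sequence shifted by one;
# the one_hot(...).T calls below turn both into one-hot matrices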
x = F.one_hot(x, len(idx_to_token)).T
y = F.one_hot(y, len(idx_to_token)).T
W_xh = torch.normal(0.1, 0.1, (sentence_len - 1, 32), device=device, requires_grad=True)
W_hh = torch.normal(0.1, 0.1, (32, 32), device=device, requires_grad=True)
b_h = torch.zeros((32,), device=device, requires_grad=True)
W_xq = torch.normal(0.1, 0.1, (32, sentence_len - 1), device=device, requires_grad=True)
b_q = torch.zeros((sentence_len - 1,), device=device, requires_grad=True)
H = torch.ones((1, 32), device=device) * 0.1
criterion = nn.CrossEntropyLoss()
epochs = 20000
for epoch in range(epochs):
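    # forward pass over the whole one-hot sequence at once; H.detach() stops gradients from flowing back into the previous epoch's state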
H = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(H.detach(), W_hh) + b_h)
y_hat = torch.mm(H, W_xq) + b_q
loss = criterion(y_hat.float(), y.float())
if (epoch + 1) % 100 == 0:
print(f"epoch {epoch + 1} / {epochs}: ---> loss:{loss:4f}")
print("--------------------------------------------------")
loss.backward()
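    # note: d2l clips against a fixed threshold (e.g. theta=1); here the current loss value is used as the threshold, which is unusual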
grad_clipping(params=[W_xh, W_hh, b_h, W_xq, b_q], theta=loss)
with torch.no_grad():
sgd([W_xh, W_hh, b_h, W_xq, b_q])
def rnn(x, state):
state = F.tanh(torch.mm(x.float(), W_xh) + torch.mm(state.detach(), W_hh) + b_h)
y = torch.mm(state, W_xq) + b_q
return y, state
prefix = torch.tensor([0, 1, 2], device=device)
state = torch.ones((1, 32), device=device) * 0.1
prefix = F.one_hot(prefix, 7)
prefix = prefix.reshape(prefix.shape[0], 1, prefix.shape[1])
num_pre = 3
outputs = [prefix[0]]
for y in prefix[1:]:
_, state = rnn(outputs[-1], state)
outputs.append(y)
for _ in range(num_pre):
y, state = rnn(outputs[-1], state)
outputs.append(y)
print([idx_to_token[torch.argmax(i)] for i in outputs])
Mu Li's code really is a pleasure to read and I borrowed a lot from it, but my own code above still has plenty of mistakes and points worth reflecting on:
- one_hot should really be replaced with word-embedding vectors; as the sentence grows longer, the loss alone makes the problem obvious (see the sketch after this list).
- The code does not add any masking yet; once masking is added, one-to-many, many-to-many, and many-to-one setups can all be configured flexibly.
- Because there is only one sentence, I did not notice at the time how inefficient this design actually is.
- With this kind of computation the hidden state never really propagates horizontally through time; hidden is treated as one matrix and multiplied in a single shot.
- In actual testing the gradient-descent behaviour is worrying.
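A minimal sketch of the first point, with nn.Embedding in place of the one-hot input (sizes are hypothetical; everything else mirrors the recurrence in the code above):

import torch
from torch import nn

vocab_size, embed_dim, num_hidden = 7, 16, 32    # hypothetical sizes
embedding = nn.Embedding(vocab_size, embed_dim)

W_xh = torch.randn(embed_dim, num_hidden, requires_grad=True)
W_hh = torch.randn(num_hidden, num_hidden, requires_grad=True)
b_h = torch.zeros(num_hidden, requires_grad=True)

token_ids = torch.tensor([0, 1, 2, 3])           # token indices instead of one-hot rows
H = torch.zeros(1, num_hidden)
for idx in token_ids:
    x = embedding(idx).reshape(1, -1)            # dense (1, embed_dim) vector, learned jointly with the model
    H = torch.tanh(x @ W_xh + H @ W_hh + b_h)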
RNN experiment code for multiple sentences
With the lessons above, the multi-sentence experiment looks noticeably different.
import torch
from processing import read_tokens, Vocab
from torch import nn
def get_params(vocab_size, num_hidden, device):
input_size = output_size = vocab_size
def normal(in_channel, out_channel):
return torch.randn((in_channel, out_channel), device=device, requires_grad=True)
w_xh = normal(input_size, num_hidden)
w_hh = normal(num_hidden, num_hidden)
b_h = torch.zeros((num_hidden,), device=device, requires_grad=True)
w_xq = normal(output_size, num_hidden)
b_q = torch.zeros((output_size,), device=device, requires_grad=True)
return [w_xh, w_hh, b_h, w_xq, b_q]
def init_state(batch_size, num_hidden, device):
    # one state row per sequence in the batch, all zeros at the start
    return torch.zeros((batch_size, num_hidden), device=device)
def rnn(inputs, params, state):
w_xh, w_hh, b_h, w_xq, b_q = params
outputs = []
for x in inputs:
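        # x is one time step of shape (batch_size, vocab_size); the same state is threaded through every step, and detach() truncates backpropagation through time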
state = torch.tanh(torch.mm(x, w_xh) + torch.mm(state.detach(), w_hh) + b_h)
y_hat = torch.mm(state, w_xq.T) + b_q
outputs.append(y_hat.reshape(1, y_hat.shape[0], y_hat.shape[1]))
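    # the stacked outputs are time-major (num_steps, batch, vocab); transpose back to batch-first before returning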
return torch.cat(outputs, dim=0).transpose(1, 0), state
def one_hot(inputs, dict_size, num_steps, batch_size, device):
features = torch.zeros((batch_size, num_steps, dict_size))
for i in range(batch_size):
for j in range(num_steps):
features[i, j, inputs[i][j]] = 1
return features
class RNNScratch:
def __init__(self, vocab_size, num_hidden, forward_fn, init_state, device):
self.params = get_params(vocab_size, num_hidden, device)
self.num_hidden = num_hidden
self.vocab_size = vocab_size
self.device = device
self.forward_fn = forward_fn
self.init_state = init_state
def __call__(self, x):
batch_size = x.shape[0]
        state = self.init_state(batch_size, self.num_hidden, self.device)
return self.forward_fn(x.transpose(1, 0), self.params, state)
def sgd(params, lr=0.05):
with torch.no_grad():
for param in params:
param -= param.grad * lr
param.grad.zero_()
def train(epochs, model, inputs, outputs, criterion, optimizer, device):
for epoch in range(epochs):
y_hat, _ = model(inputs.to(device))
loss = criterion(y_hat, outputs.to(device))
loss.backward()
optimizer(model.params)
if (epoch + 1) % 100 == 0:
print(f"{epoch + 1} / {epochs} ---- loss:{loss.item():.4f}")
def mask_processing(tokens, num_steps):
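    # a simple shift rather than a true mask: inputs drop the last num_steps tokens, targets drop the first num_steps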
return tokens[:, :-num_steps], tokens[:, num_steps:]
def predict(characters, vocab, model, device):
inputs = torch.tensor([[vocab[character] for character in characters]])
inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)
outputs, _ = model(inputs.to(device))
outputs = outputs.reshape(-1, len(vocab))
result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
return result
def sample(model, vocab, out_len, prefix, device):
    # keep generating until the output reaches out_len words
    characters = prefix.strip().lower().split()
size = out_len - len(characters)
for _ in range(size):
character = predict(characters, vocab, model, device=device)
characters.append(character[-1])
print(" ".join(characters))
if __name__ == '__main__':
device = torch.device("cuda:0")
tokens = read_tokens()
vocab = Vocab(tokens)
vocab_size = len(vocab)
tokens_len = len(tokens)
tokens = torch.tensor([vocab[token] for token in tokens])
num_steps = 1
inputs, outputs = mask_processing(tokens, num_steps)
inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)
num_hidden = 12
model = RNNScratch(vocab_size, num_hidden, rnn, init_state, device=device)
criterion = nn.CrossEntropyLoss()
train(1000, model, inputs, outputs, criterion, sgd, device=device)
# predict(["do", "you"], vocab, model, device=device)
sample(model, vocab, 10, "how do you", device)
As stated at the beginning, the hidden state has to be passed along continuously. Just as I was wondering why not transpose X and compute along the time axis, Mu Li's code confirmed exactly that idea (humbling; I have since revised the code above to use the time-major transpose).
In the code above I therefore made the sequence (time) dimension the first axis, with remarkably good results.
Beginners should take note, though: rearranging dimensions requires a solid grasp of linear algebra, or things will go wrong. That is also why the result inside the rnn forward function is transposed back at the end, restoring the original layout.
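Concretely, the shape round trip looks like this (the sizes are arbitrary, purely for illustration):

import torch

x = torch.zeros(4, 10, 7)            # (batch_size, num_steps, vocab_size), batch-first
x_time_major = x.transpose(1, 0)     # (num_steps, batch_size, vocab_size): loop over time steps
assert x_time_major.transpose(1, 0).shape == x.shape   # transposing again restores the original layout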
At this point the code essentially implements the RNN mechanism. What remains is adding batch_size: in real development you cannot feed an enormous text into the model in one go, the cost is far too high.
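A sketch of that remaining step: get_batches below is my own illustrative name, not part of the code above, and it assumes the corpus has already been turned into one long sequence of token indices. Each yielded pair is a (batch_size, num_steps) slice of inputs together with the targets shifted by one token:

import torch

def get_batches(corpus_idx, batch_size, num_steps):
    # corpus_idx: 1-D tensor of token indices for the whole corpus
    num_tokens = ((len(corpus_idx) - 1) // (batch_size * num_steps)) * batch_size * num_steps
    xs = corpus_idx[:num_tokens].reshape(batch_size, -1)
    ys = corpus_idx[1:num_tokens + 1].reshape(batch_size, -1)
    for i in range(0, xs.shape[1], num_steps):
        yield xs[:, i:i + num_steps], ys[:, i:i + num_steps]

corpus_idx = torch.randint(0, 7, (1000,))        # stand-in for the indexed corpus
for x_batch, y_batch in get_batches(corpus_idx, batch_size=32, num_steps=8):
    pass                                         # one-hot / embed x_batch, then run one training step per batch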
Implementation with nn.RNN
import torch
from processing import read_tokens, Vocab
from torch import nn
def get_params(vocab_size, num_hidden, device):
input_size = output_size = vocab_size
def normal(in_channel, out_channel):
return torch.randn((in_channel, out_channel), device=device, requires_grad=True)
w_xh = normal(input_size, num_hidden)
w_hh = normal(num_hidden, num_hidden)
b_h = torch.zeros((num_hidden,), device=device, requires_grad=True)
w_xq = normal(output_size, num_hidden)
b_q = torch.zeros((output_size,), device=device, requires_grad=True)
return [w_xh, w_hh, b_h, w_xq, b_q]
def init_state(num_layers, batch_size, num_hidden, device):
    # nn.RNN expects the initial hidden state as (num_layers, batch_size, num_hidden)
    return torch.zeros((num_layers, batch_size, num_hidden), device=device)
def one_hot(inputs, dict_size, num_steps, batch_size, device):
features = torch.zeros((batch_size, num_steps, dict_size))
for i in range(batch_size):
for j in range(num_steps):
features[i, j, inputs[i][j]] = 1
return features
class RNNModel(nn.Module):
def __init__(self, vocab_size, num_hidden, n_layers, device, *args, **kwargs):
super().__init__(*args, **kwargs)
self.num_hidden = num_hidden
self.vocab_size = vocab_size
self.n_layers = n_layers
self.device = device
self.rnn = nn.RNN(vocab_size, num_hidden, num_layers=self.n_layers, device=device)
self.fc = nn.Linear(num_hidden, vocab_size, device=device)
def forward(self, x):
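        # with the default batch_first=False, nn.RNN reads x as (seq_len, batch, input_size), so x.shape[1] is the batch dimension it sees; the initial state must be (num_layers, batch, num_hidden)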
batch_size = x.shape[1]
state = init_state(self.n_layers, batch_size, self.num_hidden, self.device)
y, state = self.rnn(x, state)
return self.fc(y), state
def train(epochs, model, inputs, outputs, criterion, optimizer, device):
for epoch in range(epochs):
optimizer.zero_grad()
y_hat, _ = model(inputs.to(device))
loss = criterion(y_hat, outputs.to(device))
loss.backward()
optimizer.step()
if (epoch + 1) % 100 == 0:
print(f"{epoch + 1} / {epochs} ---- loss:{loss.item():.4f}")
def mask_processing(tokens, num_steps):
return tokens[:, :-num_steps], tokens[:, num_steps:]
def predict(characters, vocab, model, device):
inputs = torch.tensor([[vocab[character] for character in characters]])
inputs = one_hot(inputs, len(vocab), inputs.shape[1], 1, device=device)
outputs, _ = model(inputs.to(device))
outputs = outputs.reshape(-1, len(vocab))
result = [vocab.to_tokens(index) for index in torch.argmax(outputs, dim=1)]
return result
def sample(model, vocab, out_len, prefix, device):
    # keep generating until the output reaches out_len words
    characters = prefix.strip().lower().split()
size = out_len - len(characters)
for _ in range(size):
character = predict(characters, vocab, model, device=device)
characters.append(character[-1])
print(" ".join(characters))
if __name__ == '__main__':
device = torch.device("cuda:0")
tokens = read_tokens()
vocab = Vocab(tokens)
vocab_size = len(vocab)
tokens_len = len(tokens)
tokens = torch.tensor([vocab[token] for token in tokens])
num_steps = 1
inputs, outputs = mask_processing(tokens, num_steps)
inputs = one_hot(inputs, vocab_size, inputs.shape[1], tokens_len, device=device)
outputs = one_hot(outputs, vocab_size, outputs.shape[1], tokens_len, device=device)
num_hidden = 12
model = RNNModel(vocab_size, num_hidden, n_layers=1, device=device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
train(1000, model, inputs, outputs, criterion, optimizer, device=device)
# predict(["do", "you"], vocab, model, device=device)
sample(model, vocab, 10, "how do you", device)
The data preprocessing used above (read_tokens, Vocab) is explained in detail in Dive into Deep Learning (《动手学深度学习》).
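The processing module itself is not included in this post. Purely as an assumption, a minimal version that is compatible with the calls made above (read_tokens returning equal-length lists of word tokens so they can be stacked into a 2-D tensor, and a Vocab in the style of D2L's) might look like the following; the real module may well differ:

# processing.py -- hypothetical sketch, not the author's actual module
import collections
import re

def read_tokens(path="corpus.txt", num_steps=8):
    # assumed behaviour: read a plain-text file, keep letters only,
    # then chop the word list into equal-length windows
    with open(path) as f:
        text = re.sub("[^a-zA-Z]+", " ", f.read()).lower()
    words = text.split()
    return [words[i:i + num_steps]
            for i in range(0, len(words) - num_steps + 1, num_steps)]

class Vocab:
    """Token <-> index mapping, loosely following the D2L Vocab class."""
    def __init__(self, tokens):
        flat = [tok for line in tokens for tok in line]
        counter = collections.Counter(flat)
        self.idx_to_token = ["<unk>"] + [tok for tok, _ in counter.most_common()]
        self.token_to_idx = {tok: idx for idx, tok in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(tok) for tok in tokens]
        return self.token_to_idx.get(tokens, 0)

    def to_tokens(self, index):
        return self.idx_to_token[index]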