Encoding
sentence = "the quick brown fox jumped over the lazy dog"
words = sentence.split(' ')  # split the sentence into words
print(words)
Output:
['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']
Note: set() creates an unordered collection with no duplicate elements. It supports membership testing, removes duplicates, and can compute intersections, differences, unions, and so on; a short sketch of those operations follows the output below.
x = set('runoob')
y = set('google')
print(x, y)  # the duplicate 'o' is removed
Output:
{'b', 'o', 'u', 'n', 'r'} {'e', 'g', 'l', 'o'}
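A minimal sketch of the relational operations mentioned in the note above (intersection, difference, union), reusing the same x and y sets; this aside is illustrative only and not part of the main program:
x = set('runoob')
y = set('google')
print(x & y)  # intersection: elements in both sets, here {'o'}
print(x - y)  # difference: elements in x but not in y (order varies, sets are unordered)
print(x | y)  # union: all distinct elements from both sets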
Main program:
vocab1 = list(set(words))  # the duplicate 'the' is removed
print(vocab1)
Output:
['lazy', 'the', 'quick', 'jumped', 'brown', 'fox', 'over', 'dog']
Main program:
print(len(words))
print(len(vocab1))
Output:
9
8
One-hot Encoding Example
Drawbacks: one-hot encoding requires a very high dimensionality and the codes are sparse (each vector carries little information).
# convert words to indexes
word_to_ix1 = {word : i for i, word in enumerate(vocab1)} # convert to dict
print(word_to_ix1)
Output:
{'lazy': 0, 'the': 1, 'quick': 2, 'jumped': 3, 'brown': 4, 'fox': 5, 'over': 6, 'dog': 7}
Main program:
import torch
from torch.nn.functional import one_hot
words = torch.tensor([word_to_ix1[w] for w in vocab1], dtype=torch.long)
print(words)
one_hot_encoding = one_hot(words)  # one-hot encoding; the width is inferred from the largest index
print(vocab1)
print(one_hot_encoding)  # sparse: each row carries very little information
Output:
tensor([0, 1, 2, 3, 4, 5, 6, 7])
['lazy', 'the', 'quick', 'jumped', 'brown', 'fox', 'over', 'dog']
tensor([[1, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 1]])
One-hot encoding has drawbacks such as high dimensionality and sparse codes (little information per vector). We will not continue the program with one-hot here; to see how it can be applied, refer to my other blog post on implementing character sorting with an RNN. Below, Embedding is used for a more efficient encoding.
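To make the dimensionality point concrete, here is a rough comparison; this is a sketch with an assumed vocabulary of 10,000 words, not taken from this post. A one-hot vector has one dimension per vocabulary word and exactly one non-zero entry, while nn.Embedding produces a dense, trainable vector of a fixed small size.
import torch
import torch.nn as nn
from torch.nn.functional import one_hot

VOCAB = 10000                 # assumed vocabulary size, for illustration only
word_index = torch.tensor([42])

sparse = one_hot(word_index, num_classes=VOCAB)  # shape [1, 10000], a single 1, the rest 0
dense = nn.Embedding(VOCAB, 10)(word_index)      # shape [1, 10], every entry is trainable
print(sparse.shape, dense.shape)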
Word Embedding Example
Train on a passage of text so that the network can predict the third word from the previous two.
# Context is the number of words we are using as a context for the next word we want to predict.
CONTEXT_SIZE = 2
# Embedding dimension is the size of embedding vector
EMBEDDING_DIM = 10
# Size of the hidden layer
HIDDEN_DIM = 256
# we will use the "Tomorrow, and tomorrow, and tomorrow" soliloquy from Macbeth
test_sentence = """Tomorrow and tomorrow and tomorrow,
Creeps in this petty pace from day to day
To the last syllable of recorded time,
And all our yesterdays have lighted fools
The way to dusty death. Out, out, brief candle!
Life's but a walking shadow, a poor player
That struts and frets his hour upon the stage
And then is heard no more: it is a tale
Told by an idiot, full of sound and fury,
Signifying nothing.""".lower().split()
# Build a list of tuples. Each tuple is ([word_i-2, word_i-1], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2])
for i in range(len(test_sentence) - 2)]
# print the first 6, just so you can see what they look like
print(trigrams[:6])
vocab2 = list(set(test_sentence))
print('=='*50)
print(vocab2)
print('=='*50)
print(len(test_sentence))
print(len(vocab2))
word_to_ix2 = {word : i for i, word in enumerate(vocab2)}
print('=='*50)
print(word_to_ix2)
Output:
[(['tomorrow', 'and'], 'tomorrow'), (['and', 'tomorrow'], 'and'), (['tomorrow', 'and'], 'tomorrow,'), (['and', 'tomorrow,'], 'creeps'), (['tomorrow,', 'creeps'], 'in'), (['creeps', 'in'], 'this')]
====================================================================================================
['that', 'an', 'told', 'his', 'our', 'no', 'by', 'to', 'of', 'signifying', 'brief', 'syllable', 'time,', 'fools', 'creeps', 'idiot,', 'walking', 'then', 'fury,', 'tomorrow,', 'out,', 'struts', 'is', 'but', 'poor', 'candle!', 'in', 'tomorrow', 'upon', 'stage', 'the', 'a', 'hour', 'heard', 'tale', 'and', 'recorded', 'full', 'it', 'nothing.', 'shadow,', 'petty', 'yesterdays', 'pace', 'frets', 'way', 'day', 'more:', 'have', 'last', 'lighted', 'death.', 'sound', 'this', 'all', 'player', 'from', "life's", 'dusty']
====================================================================================================
75
59
====================================================================================================
{'that': 0, 'an': 1, 'told': 2, 'his': 3, 'our': 4, 'no': 5, 'by': 6, 'to': 7, 'of': 8, 'signifying': 9, 'brief': 10, 'syllable': 11, 'time,': 12, 'fools': 13, 'creeps': 14, 'idiot,': 15, 'walking': 16, 'then': 17, 'fury,': 18, 'tomorrow,': 19, 'out,': 20, 'struts': 21, 'is': 22, 'but': 23, 'poor': 24, 'candle!': 25, 'in': 26, 'tomorrow': 27, 'upon': 28, 'stage': 29, 'the': 30, 'a': 31, 'hour': 32, 'heard': 33, 'tale': 34, 'and': 35, 'recorded': 36, 'full': 37, 'it': 38, 'nothing.': 39, 'shadow,': 40, 'petty': 41, 'yesterdays': 42, 'pace': 43, 'frets': 44, 'way': 45, 'day': 46, 'more:': 47, 'have': 48, 'last': 49, 'lighted': 50, 'death.': 51, 'sound': 52, 'this': 53, 'all': 54, 'player': 55, 'from': 56, "life's": 57, 'dusty': 58}
Main program:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
class NGramLanguageModeler(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size):  # 59, 10, 2
        super(NGramLanguageModeler, self).__init__()
        # vocab_size: size of the embedding dictionary (number of words in the vocabulary)
        # embedding_dim: size of each embedding vector produced
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, HIDDEN_DIM)
        self.linear2 = nn.Linear(HIDDEN_DIM, vocab_size)

    def forward(self, inputs):
        """Predict the third word from the previous two. `inputs` holds two word indices.
        The embedding is an initialized matrix with one row per vocabulary word and
        embedding_dim columns; indexing it pulls out two 10-dimensional rows, which are
        flattened into the network input used for training and backpropagation."""
        embeds = self.embedding(inputs)  # embeds: torch.Size([2, 10])
        embeds = embeds.view((1, -1))    # embeds: torch.Size([1, 20])
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
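# (Illustrative aside, not part of the original post.) A quick shape check of the lookup
# done in forward(): an nn.Embedding(59, 10) layer maps two word indices to a [2, 10]
# tensor, which view((1, -1)) flattens into the [1, 20] input of linear1. Uncomment to try:
# demo = nn.Embedding(59, EMBEDDING_DIM)
# print(demo(torch.tensor([3, 7])).shape)                 # torch.Size([2, 10])
# print(demo(torch.tensor([3, 7])).view((1, -1)).shape)   # torch.Size([1, 20])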
learning_rate = 0.001
losses = []
loss_function = nn.NLLLoss() # negative log likelihood
model = NGramLanguageModeler(len(vocab2), EMBEDDING_DIM, CONTEXT_SIZE) # 59, 10, 2
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
from tqdm import tqdm  # add a progress bar
for epoch in range(50):
    total_loss = 0
    iterator = tqdm(trigrams)
    for context, target in iterator:
        # Step 1. Prepare the inputs to be passed to the model
        # (i.e. turn the words into integer indices and wrap them in tensors)
        context_idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
        # Step 2. Recall that torch *accumulates* gradients. Before passing in a new
        # instance, you need to zero out the gradients from the old instance.
        model.zero_grad()
        # Step 3. Run the forward pass, getting log probabilities over next words
        log_probs = model(context_idxs)  # torch.Size([1, 59])
        # Step 4. Compute your loss function. (Again, Torch wants the target word wrapped
        # in a tensor.) log_probs has shape [1, 59]; the target is a single LongTensor
        # class index, which NLLLoss uses directly, no one-hot conversion is needed.
        loss = loss_function(log_probs, torch.tensor([word_to_ix2[target]], dtype=torch.long))
        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()
        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        iterator.set_postfix(loss=float(loss))  # show the current loss after the progress bar
    losses.append(total_loss)
Output:
100%|██████████| 73/73 [00:01<00:00, 55.12it/s, loss=4.36]
100%|██████████| 73/73 [00:01<00:00, 43.03it/s, loss=4.32]
100%|██████████| 73/73 [00:00<00:00, 73.05it/s, loss=4.28]
100%|██████████| 73/73 [00:00<00:00, 107.01it/s, loss=4.24]
100%|██████████| 73/73 [00:00<00:00, 144.08it/s, loss=4.21]
100%|██████████| 73/73 [00:00<00:00, 170.22it/s, loss=4.17]
100%|██████████| 73/73 [00:00<00:00, 140.76it/s, loss=4.13]
100%|██████████| 73/73 [00:00<00:00, 142.40it/s, loss=4.1]
100%|██████████| 73/73 [00:00<00:00, 161.22it/s, loss=4.06]
100%|██████████| 73/73 [00:00<00:00, 132.12it/s, loss=4.03]
100%|██████████| 73/73 [00:00<00:00, 123.63it/s, loss=3.99]
100%|██████████| 73/73 [00:05<00:00, 13.89it/s, loss=3.95]
100%|██████████| 73/73 [00:01<00:00, 52.43it/s, loss=3.92]
100%|██████████| 73/73 [00:01<00:00, 65.12it/s, loss=3.88]
100%|██████████| 73/73 [00:02<00:00, 33.55it/s, loss=3.85]
100%|██████████| 73/73 [00:01<00:00, 71.55it/s, loss=3.81]
100%|██████████| 73/73 [00:01<00:00, 58.79it/s, loss=3.78]
100%|██████████| 73/73 [00:01<00:00, 46.27it/s, loss=3.75]
100%|██████████| 73/73 [00:01<00:00, 47.81it/s, loss=3.71]
100%|██████████| 73/73 [00:01<00:00, 52.66it/s, loss=3.68]
100%|██████████| 73/73 [00:00<00:00, 138.89it/s, loss=3.64]
100%|██████████| 73/73 [00:00<00:00, 112.26it/s, loss=3.61]
100%|██████████| 73/73 [00:00<00:00, 126.64it/s, loss=3.57]
100%|██████████| 73/73 [00:00<00:00, 122.00it/s, loss=3.54]
100%|██████████| 73/73 [00:00<00:00, 138.63it/s, loss=3.51]
100%|██████████| 73/73 [00:00<00:00, 130.24it/s, loss=3.47]
100%|██████████| 73/73 [00:00<00:00, 149.38it/s, loss=3.44]
100%|██████████| 73/73 [00:00<00:00, 138.10it/s, loss=3.4]
100%|██████████| 73/73 [00:00<00:00, 138.89it/s, loss=3.37]
100%|██████████| 73/73 [00:00<00:00, 130.47it/s, loss=3.34]
100%|██████████| 73/73 [00:00<00:00, 141.58it/s, loss=3.3]
100%|██████████| 73/73 [00:00<00:00, 132.12it/s, loss=3.27]
100%|██████████| 73/73 [00:00<00:00, 124.27it/s, loss=3.24]
100%|██████████| 73/73 [00:00<00:00, 154.10it/s, loss=3.2]
100%|██████████| 73/73 [00:00<00:00, 138.63it/s, loss=3.17]
100%|██████████| 73/73 [00:00<00:00, 154.74it/s, loss=3.14]
100%|██████████| 73/73 [00:00<00:00, 117.11it/s, loss=3.1]
100%|██████████| 73/73 [00:00<00:00, 114.55it/s, loss=3.07]
100%|██████████| 73/73 [00:00<00:00, 171.02it/s, loss=3.04]
100%|██████████| 73/73 [00:00<00:00, 176.79it/s, loss=3]
100%|██████████| 73/73 [00:00<00:00, 171.82it/s, loss=2.97]
100%|██████████| 73/73 [00:00<00:00, 131.17it/s, loss=2.94]
100%|██████████| 73/73 [00:00<00:00, 128.41it/s, loss=2.9]
100%|██████████| 73/73 [00:00<00:00, 160.53it/s, loss=2.87]
100%|██████████| 73/73 [00:00<00:00, 130.94it/s, loss=2.84]
100%|██████████| 73/73 [00:00<00:00, 149.38it/s, loss=2.81]
100%|██████████| 73/73 [00:00<00:00, 144.09it/s, loss=2.77]
100%|██████████| 73/73 [00:00<00:00, 159.13it/s, loss=2.74]
100%|██████████| 73/73 [00:00<00:00, 137.06it/s, loss=2.71]
100%|██████████| 73/73 [00:00<00:00, 176.37it/s, loss=2.68]
Main program:
# check the structure of our model here
# model.eval()
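One simple way to do that check (a sketch I am adding here, not from the original post): print the module to list its layers, and read a learned 10-dimensional word vector straight out of the embedding's weight matrix.
print(model)  # Embedding(59, 10), Linear(20 -> 256), Linear(256 -> 59)
print(model.embedding.weight[word_to_ix2['tomorrow']])  # the learned vector for 'tomorrow'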
Testing
Feed in the first two words and see whether the third word is predicted correctly.
import numpy
with torch.no_grad():
    context3 = ['tomorrow', 'and']
    context_idxs3 = torch.tensor([word_to_ix2[w] for w in context3], dtype=torch.long)
    pred = model(context_idxs3)  # torch.Size([1, 59])
    index_of_prediction = numpy.argmax(pred)
    print(vocab2[index_of_prediction])
Output:
tomorrow,
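As an optional follow-up (my own sketch, not in the original post), the same check can be run over every trigram in the training text to see how many third words the model gets right after 50 epochs:
correct = 0
with torch.no_grad():
    for context, target in trigrams:
        idxs = torch.tensor([word_to_ix2[w] for w in context], dtype=torch.long)
        predicted = vocab2[model(idxs).argmax(dim=1).item()]
        correct += int(predicted == target)
print(f'{correct} / {len(trigrams)} trigrams predicted correctly')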