直接看这个-->Github
导包:
import re
import math
import torch
import numpy as np
from random import *
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
1. 数据预处理
1.1 构造单词表和映射
text = (
'Hello, how are you? I am Romeo.\n' # R
'Hello, Romeo My name is Juliet. Nice to meet you.\n' # J
'Nice to meet you too. How are you today?\n' # R
'Great. My baseball team won the competition.\n' # J
'Oh Congratulations, Juliet\n' # R
'Thank you Romeo\n' # J
'Where are you going today?\n' # R
'I am going shopping. What about you?\n' # J
'I am going to visit my grandmother. she is not very well' # R
)
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n') # filter '.', ',', '?', '!'
# 所有句子的单词list
word_list = list(set(" ".join(sentences).split())) # ['hello', 'how', 'are', 'you',...]
# 给单词表中所有单词设置序号
word2idx = {'[PAD]' : 0, '[CLS]' : 1, '[SEP]' : 2, '[MASK]' : 3}
for i, w in enumerate(word_list):
word2idx[w] = i + 4
# 用于 idx 映射回 word
idx2word = {i: w for i, w in enumerate(word2idx)}
vocab_size = len(word2idx) # 40
# token: 就是每个单词在词表中的index
token_list = list() # token_list存储了每一句的token
for sentence in sentences:
arr = [word2idx[s] for s in sentence.split()]
token_list.append(arr)
展示一下