This section mainly covers the usage of Dataset and DataLoader.
(1) Data processing: building batches and vectorizing the vocabulary
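The code below expects a plain-text corpus with one sample per line, the text and its integer class label separated by a tab. A hypothetical two-line file (the texts and labels here are placeholders, not real data) would look like:

this is the first example text	0
this is the second example text	1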
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

tokenizer = lambda x: [y for y in x]  # character-level tokenizer: split text into single characters
UNK, PAD = '<UNK>', '<PAD>'  # unknown token and padding token

def build_vocab(path, max_size, freq_min):
    """Count token frequencies in the corpus and build a token-to-index vocabulary."""
    vocab_dic = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            sentence = line.strip()
            if not sentence:
                continue
            content = sentence.split("\t")[0]  # the text sits before the tab, the label after it
            for word in tokenizer(content):
                vocab_dic[word] = vocab_dic.get(word, 0) + 1
    # Keep at most max_size tokens that occur at least freq_min times, most frequent first
    vocab_list = sorted(
        [item for item in vocab_dic.items() if item[1] >= freq_min],
        key=lambda x: x[1], reverse=True)[:max_size]
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    # Append the two special tokens at the end of the vocabulary
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1})
    return vocab_dic
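A quick usage sketch (the path "train.txt" is a placeholder for your own corpus, not a file from the original text): with freq_min=1 every character seen in the training text gets an id, and <UNK> and <PAD> always occupy the last two slots.

# Hypothetical usage; "train.txt" stands in for the real corpus path
vocab = build_vocab("train.txt", max_size=10000, freq_min=1)
print(len(vocab))                # vocabulary size, including <UNK> and <PAD>
print(vocab[UNK], vocab[PAD])    # the special tokens sit at the end of the index range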
def load_data(path, padding_size=32):
    """Read the corpus and turn each sample into (token ids, label, sequence length)."""
    contents = []
    vocab = build_vocab(path, max_size=10000, freq_min=1)
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            sentence = line.strip()
            if not sentence:
                continue
            text, label = sentence.split("\t")
            word_line = []
            token = tokenizer(text)
            seq_len = len(token)
            if padding_size:
                if len(token) < padding_size:
                    # Pad short texts up to padding_size
                    token.extend([PAD] * (padding_size - len(token)))
                else:
                    # Truncate long texts and clamp the recorded length
                    token = token[:padding_size]
                    seq_len = padding_size
            # Map tokens to ids, falling back to <UNK> for out-of-vocabulary tokens
            for word in token:
                word_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((word_line, int(label), seq_len))
    return contents, vocab
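To actually batch this output with the Dataset and DataLoader classes imported above, a minimal sketch follows. The class name TextDataset and batch_size=64 are illustrative choices, not part of the original code; because every sample was already padded to padding_size, the default collate function can stack the tensors directly.

# Minimal sketch (hypothetical names): wrap the load_data output for batching
class TextDataset(Dataset):
    def __init__(self, contents):
        self.contents = contents

    def __len__(self):
        return len(self.contents)

    def __getitem__(self, idx):
        word_line, label, seq_len = self.contents[idx]
        return (torch.tensor(word_line, dtype=torch.long),
                torch.tensor(label, dtype=torch.long),
                torch.tensor(seq_len, dtype=torch.long))

# Hypothetical usage; "train.txt" stands in for the real corpus path
contents, vocab = load_data("train.txt", padding_size=32)
loader = DataLoader(TextDataset(contents), batch_size=64, shuffle=True)
for x, y, lengths in loader:
    print(x.shape, y.shape)  # (batch_size, padding_size) and (batch_size,)
    break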