NLP Project 1: IMDB Movie Review Sentiment Analysis

1. Downloading the IMDB Dataset

http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
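
The archive can also be fetched and unpacked programmatically. A minimal sketch (the target directory 'data' below is an arbitrary choice; point it wherever your data_dir will live):

import os
import tarfile
import urllib.request

url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
archive = 'aclImdb_v1.tar.gz'
if not os.path.exists(archive):
    urllib.request.urlretrieve(url, archive)  # ~80 MB download
with tarfile.open(archive, 'r:gz') as f:
    f.extractall('data')  # creates data/aclImdb/{train,test}/{pos,neg}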

2. Reading the Dataset

import os
import torch
from torch import nn
import dltools
def read_imdb(data_dir, is_train):
    """Read the IMDB reviews and their labels (1 = pos, 0 = neg)."""
    data, labels = [], []
    for label in ('pos', 'neg'):
        folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label)
        for file in os.listdir(folder_name):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '')
                data.append(review)
                labels.append(1 if label == 'pos' else 0)
    return data, labels
data_dir = r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\aclImdb'
train_data = read_imdb(data_dir, is_train=True)
print('training set size: ', len(train_data[0]))
training set size:  25000

for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('label: ', y, 'review: ', x[0:60])
label:  1 review:  Bromwell High is a cartoon comedy. It ran at the same time a
label:  1 review:  Homelessness (or Houselessness as George Carlin stated) has 
label:  1 review:  Brilliant over-acting by Lesley Ann Warren. Best dramatic ho

3. Tokenization

train_tokens = dltools.tokenize(train_data[0], token='word')
train_tokens
[['Bromwell',
  'High',
  'is',
  'a',
  'cartoon',
  'comedy.',
  'It',
  'ran',
  'at',
  ...
  'for',
  'yourself.'],
 ...]

4. Vocabulary

vocab = dltools.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])  # keep tokens appearing at least 5 times
len(vocab)
49347
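
A quick sanity check of the vocabulary (assuming the Vocab API listed in section 17): index 0 is '<unk>', index 1 is the reserved '<pad>' token, and frequent words get small indices.

print(vocab.idx_to_token[:4])   # ['<unk>', '<pad>', 'the', 'a']
print(vocab[['the', 'movie']])  # token -> index lookup, e.g. [2, ...]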

dltools.set_figsize()  # data exploration
dltools.plt.xlabel('# tokens per review')
dltools.plt.ylabel('count')
dltools.plt.hist([len(line) for line in train_tokens], bins=range(0, 1000, 50))
(array([ 553., 2373., 6820., 4834., 2817., 1848., 1380., 1005.,  759.,
         581.,  437.,  349.,  257.,  207.,  174.,  133.,  116.,   85.,
          75.]),
 array([  0,  50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600,
        650, 700, 750, 800, 850, 900, 950]),
 ...)

[Figure: histogram of # tokens per review; most reviews fall well under 500 tokens]

5. Preprocessing: Pad or Truncate Sentences

num_steps = 500 # sequence length
train_features = torch.tensor([dltools.truncate_pad(vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
train_features.shape
torch.Size([25000, 500])
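
How truncate_pad behaves, on a toy example (illustrative values only): a short sequence is padded with vocab['<pad>'] (index 1) up to num_steps, and a long one is cut off.

print(dltools.truncate_pad([7, 8, 9], 5, vocab['<pad>']))        # [7, 8, 9, 1, 1]
print(dltools.truncate_pad(list(range(10)), 5, vocab['<pad>']))  # [0, 1, 2, 3, 4]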

6. Data Iterator

train_iter = dltools.load_array((train_features, torch.tensor(train_data[1])), 64)  # features, labels, batch size
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break
print('# minibatches: ', len(train_iter))
X: torch.Size([64, 500]) , y: torch.Size([64])
# minibatches:  391

7. Putting It Together: the Data Loader

def load_data_imdb(data_dir, batch_size, num_steps=500):
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = dltools.tokenize(train_data[0], token='word')
    test_tokens = dltools.tokenize(test_data[0], token='word')
    vocab = dltools.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
    train_features = torch.tensor([dltools.truncate_pad(vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([dltools.truncate_pad(vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])  
    train_iter = dltools.load_array((train_features, torch.tensor(train_data[1])), batch_size)
    test_iter = dltools.load_array((test_features, torch.tensor(test_data[1])), batch_size, is_train=False)
    return train_iter, test_iter, vocab
data_dir = r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\aclImdb'
train_iter, test_iter, vocab = load_data_imdb(data_dir, 64, num_steps=500)
len(train_iter)
391

len(test_iter)
391

for X, y in test_iter:
    print('X:', X.shape, ', y:', y.shape)
    print(X)
    print(y)
    break
print('# minibatches: ', len(train_iter))
X: torch.Size([64, 500]) , y: torch.Size([64])
tensor([[   9,   96,  122,  ...,    1,    1,    1],
        [4787,    0,  474,  ...,    1,    1,    1],
        [   9,  320,   42,  ...,    1,    1,    1],
        ...,
        [ 397,  219,    9,  ...,    1,    1,    1],
        [3891,  350,  455,  ...,    1,    1,    1],
        [   9,  208,  347,  ...,    1,    1,    1]])
tensor([0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0,
        0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0])
# minibatches:  391

8. Bidirectional Recurrent Neural Network

class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)  # embedding layer: token index -> word vector
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True)  # encoder
        self.decoder = nn.Linear(4 * num_hiddens, 2)  # fully connected decoder, 2 classes
    def forward(self, inputs):  # (batch_size, num_steps)
        # The RNN expects the time dimension first, so transpose inputs to (num_steps, batch_size)
        embedding = self.embedding(inputs.T)  # (num_steps, batch_size, embed_size)
        # flatten_parameters() makes the weight tensors contiguous in memory, improving efficiency
        self.encoder.flatten_parameters()
        # nn.LSTM returns the outputs (the last layer's hidden state at every time step)
        # and the final (h_n, c_n), which we ignore here
        outputs, _ = self.encoder(embedding)  # (num_steps, batch_size, 2 * num_hiddens): 2 directions
        # Concatenate the hidden states of the first and last time steps as the decoder input
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)  # (batch_size, 4 * num_hiddens)
        outs = self.decoder(encoding)  # (batch_size, 2)
        return outs
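
A quick shape check with dummy token indices (a throwaway instance, not the trained model): the output should be one score per class for each sequence in the batch.

tmp_net = BiRNN(len(vocab), embed_size=100, num_hiddens=100, num_layers=2)
X = torch.randint(0, len(vocab), (2, 500))  # (batch_size, num_steps)
print(tmp_net(X).shape)  # torch.Size([2, 2])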

9. Word Vectors

class TokenEmbedding:
    def __init__(self, file_path):
        self.idx_to_token, self.idx_to_vec = self._load_embedding(file_path)
        self.unknown_idx = 0
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)}   
    def _load_embedding(self, file_path):  # load pretrained word vectors
        # reserved tokens: bos, eos, unk, ...
        idx_to_token, idx_to_vec = ['<unk>'], []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                elems = line.rstrip().split(' ')
                token, elems = elems[0], [float(elem) for elem in elems[1:]]
                if len(elems) > 1:  # skip header lines such as fastText's first line
                    idx_to_token.append(token)
                    idx_to_vec.append(elems)
        idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec  # all-zero vector for '<unk>'
        return idx_to_token, torch.tensor(idx_to_vec)
    def __getitem__(self, tokens):
        indices = [self.token_to_idx.get(token, self.unknown_idx) for token in tokens]
        vecs = self.idx_to_vec[torch.tensor(indices)]
        return vecs
    def __len__(self):
        return len(self.idx_to_token)

10. Creating the Network

embed_size, num_hiddens, num_layers = 100, 100, 2
devices = dltools.try_all_gpus()
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)

11. Parameter Initialization

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if 'weight' in param:
                nn.init.xavier_uniform_(m._parameters[param])  

12. Applying the Initialization

net.apply(init_weights)
BiRNN(
  (embedding): Embedding(49347, 100)
  (encoder): LSTM(100, 100, num_layers=2, bidirectional=True)
  (decoder): Linear(in_features=400, out_features=2, bias=True)
)

13. GloVe Word Vectors (pretrained embeddings)

glove_embedding = TokenEmbedding(r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\glove.6B\glove.6B.100d.txt')
vocab.idx_to_token
['<unk>',
 '<pad>',
 'the',
 'a',
 ...
 'imagine',
 'total',
 ...]
embeds = glove_embedding[vocab.idx_to_token]
embeds.shape
torch.Size([49347, 100])
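
A spot check of the lookup behavior: tokens missing from GloVe fall back to index 0, the all-zero '<unk>' vector (the probe token below is just an arbitrary out-of-vocabulary string).

print(glove_embedding[['beautiful']].shape)        # torch.Size([1, 100])
print(glove_embedding[['zzzz-not-a-word']].sum())  # tensor(0.)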

14. Loading the GloVe Vectors into the Network

net.embedding.weight.data.copy_(embeds)  # initialize the embedding layer with the GloVe vectors
net.embedding.weight.requires_grad = False  # freeze the embeddings: they are not fine-tuned during training

15. Training and Evaluation

lr, num_epochs = 0.01, 100
trainer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss(reduction='none')
dltools.train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices)
loss 0.168, train acc 0.933, test acc 0.798
996.3 examples/sec on [device(type='cuda', index=0)]

[Figure: training curves of train loss, train acc, and test acc over epochs]

16. Prediction

def predict_sentiment(net, vocab, sequence):
    sequence = torch.tensor(vocab[sequence.split()], device=dltools.try_gpu())
    label = torch.argmax(net(sequence.reshape(1, -1)), dim=1)
    return 'positive' if label == 1 else 'negative'
predict_sentiment(net, vocab, 'this movie is great')
'positive'
predict_sentiment(net, vocab, 'this movie is so bad')
'negative'
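
predict_sentiment scores one review at a time; to score several reviews in one forward pass, the sequences first need to be padded to a common length. A small convenience sketch (predict_batch is not part of dltools; it just reuses truncate_pad from above):

def predict_batch(net, vocab, sequences, num_steps=500):
    # tokenize, map to indices, and pad/truncate each review to num_steps
    tokens = [dltools.truncate_pad(vocab[s.split()], num_steps, vocab['<pad>'])
              for s in sequences]
    X = torch.tensor(tokens, device=dltools.try_gpu())
    labels = torch.argmax(net(X), dim=1)
    return ['positive' if l == 1 else 'negative' for l in labels]

predict_batch(net, vocab, ['this movie is great', 'this movie is so bad'])
# expected: ['positive', 'negative'], matching the single-review calls above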

17. dltools Utilities

(1) tokenize
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens."""
    if token == 'word':
        return [line.split() for line in lines]
    elif token == 'char':
        return [list(line) for line in lines]
    else:
        print('ERROR: unknown token type: ' + token)

(2) Vocab
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        # Sort according to frequencies
        counter = count_corpus(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The index for the unknown token is 0
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        uniq_tokens += [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in uniq_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1
    def __len__(self):
        return len(self.idx_to_token)
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
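
Vocab relies on a count_corpus helper that is not listed above; in d2l-style code it flattens a 2D token list and counts frequencies, roughly:

import collections

def count_corpus(tokens):
    """Count token frequencies in a 1D or 2D list of tokens."""
    if len(tokens) == 0 or isinstance(tokens[0], list):
        tokens = [token for line in tokens for token in line]
    return collections.Counter(tokens)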

(3) truncate_pad
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad sequences."""
    if len(line) > num_steps:
        return line[:num_steps]  # Truncate
    return line + [padding_token] * (num_steps - len(line))  # Pad

(4) load_array
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator."""
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

(5) train_ch13
def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
               devices=dltools.try_all_gpus()):
    """Train a model with mutiple GPUs (defined in Chapter 13)."""
    timer, num_batches = dltools.Timer(), len(train_iter)
    animator = dltools.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1],
                            legend=['train loss', 'train acc', 'test acc'])
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    for epoch in range(num_epochs):
        # Sum of training loss, sum of training accuracy, no. of examples,
        # no. of predictions
        metric = dltools.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(net, features, labels, loss, trainer,
                                      devices)
            metric.add(l, acc, labels.shape[0], labels.numel())
            timer.stop()
            if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1:
                animator.add(
                    epoch + (i + 1) / num_batches,
                    (metric[0] / metric[2], metric[1] / metric[3], None))
        test_acc = dltools.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}')
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')
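
train_ch13 in turn calls a train_batch_ch13 helper that is not listed here; in d2l-style code it looks roughly like this (dltools.accuracy counts correct predictions):

def train_batch_ch13(net, X, y, loss, trainer, devices):
    """Train for a minibatch with multiple GPUs."""
    if isinstance(X, list):
        X = [x.to(devices[0]) for x in X]
    else:
        X = X.to(devices[0])
    y = y.to(devices[0])
    net.train()
    trainer.zero_grad()
    pred = net(X)
    l = loss(pred, y)
    l.sum().backward()
    trainer.step()
    return l.sum(), dltools.accuracy(pred, y)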

(6) try_gpu
def try_gpu(i=0):
    """Return gpu(i) if exists, otherwise return cpu()."""
    if torch.cuda.device_count() >= i + 1:
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')