1.Imdb数据集下载
http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
2.数据集读取
import os
import torch
from torch import nn
import dltools
def read_imdb(data_dir, is_train):
    """Load IMDb reviews and their sentiment labels from disk.

    Every file found under ``<data_dir>/<train|test>/<pos|neg>`` is read as
    one review. Reviews from ``pos`` get label 1, reviews from ``neg`` get
    label 0; all ``pos`` reviews come before all ``neg`` reviews.

    Args:
        data_dir: Root directory of the extracted dataset.
        is_train: Read the ``train`` split when true, otherwise ``test``.

    Returns:
        A ``(data, labels)`` pair of equally long lists: review texts and
        their integer labels.
    """
    split = 'train' if is_train else 'test'
    data, labels = [], []
    for label, target in (('pos', 1), ('neg', 0)):
        folder_name = os.path.join(data_dir, split, label)
        for entry in os.listdir(folder_name):
            with open(os.path.join(folder_name, entry), 'rb') as f:
                raw = f.read()
            # Newlines are removed outright (not replaced by a space).
            data.append(raw.decode('utf-8').replace('\n', ''))
            labels.append(target)
    return data, labels
# Root of the extracted aclImdb dataset (machine-specific absolute path).
data_dir = r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\aclImdb'
# Load the training split as a (reviews, labels) pair.
train_data = read_imdb( data_dir, is_train= True )
# Report how many training reviews were read.
print ( '训练集数目: ' , len ( train_data[ 0 ] ) )
训练集数目: 25000
# Preview the first three training samples: the label followed by the first
# 60 characters of the review text.
for x, y in zip(train_data[0][:3], train_data[1][:3]):
    print('标签: ', y, 'review: ', x[0:60])
标签: 1 review: Bromwell High is a cartoon comedy. It ran at the same time a
标签: 1 review: Homelessness ( or Houselessness as George Carlin stated) has
标签: 1 review: Brilliant over- acting by Lesley Ann Warren. Best dramatic ho
3.Tokens分词
# Split every training review into word-level tokens.
train_tokens = dltools. tokenize( train_data[ 0 ] , token= 'word' )
# Notebook-style bare expression: displays the resulting token lists.
train_tokens
[ [ 'Bromwell' ,
'High' ,
'is' ,
'a' ,
'cartoon' ,
'comedy.' ,
'It' ,
'ran' ,
'at' ,
'for' ,
'yourself.' ] ,
. . . ]
4.Vocab词表
# Build the vocabulary from the training tokens: min_freq=5 is the frequency
# cut-off passed to Vocab and '<pad>' is registered as a reserved token.
vocab = dltools. Vocab( train_tokens, min_freq= 5 , reserved_tokens= [ '<pad>' ] )
# Notebook-style bare expression: displays the vocabulary size.
len ( vocab)
49347
# Plot a histogram of review lengths (tokens per review) using 50-token wide
# bins whose edges run from 0 to 950.
# NOTE(review): set_figsize is not shown here; presumably it configures the
# matplotlib figure size.
dltools. set_figsize( )
dltools. plt. xlabel( '# tokens per review' )
dltools. plt. ylabel( 'count' )
dltools. plt. hist( [ len ( line) for line in train_tokens] , bins= range ( 0 , 1000 , 50 ) )
( array( [ 553. , 2373. , 6820. , 4834. , 2817. , 1848. , 1380. , 1005. , 759. ,
581. , 437. , 349. , 257. , 207. , 174. , 133. , 116. , 85. ,
75. ] ) ,
array( [ 0 , 50 , 100 , 150 , 200 , 250 , 300 , 350 , 400 , 450 , 500 , 550 , 600 ,
650 , 700 , 750 , 800 , 850 , 900 , 950 ] ) ,
5.数据集预处理 句子填充或截断
# Fixed sequence length used for every review.
num_steps = 500
# Map each review to token indices, then truncate or pad it with the '<pad>'
# index to exactly num_steps entries so all rows stack into one 2-D tensor.
train_features = torch. tensor( [ dltools. truncate_pad( vocab[ line] , num_steps, vocab[ '<pad>' ] ) for line in train_tokens] )
# Notebook-style bare expression: displays the tensor shape.
train_features. shape
torch. Size( [ 25000 , 500 ] )
6.可迭代数据
# Wrap the features and labels in a mini-batch iterator with batch size 64.
train_iter = dltools.load_array(
    (train_features, torch.tensor(train_data[1])), 64)
# Peek at a single batch to confirm the tensor shapes, then stop iterating.
for X, y in train_iter:
    print('X:', X.shape, ', y:', y.shape)
    break
# Number of mini-batches in one pass over the training data.
print('小批次数量: ', len(train_iter))
X: torch. Size( [ 64 , 500 ] ) , y: torch. Size( [ 64 ] )
小批次数量: 391
7.组合 数据加载器
def load_data_imdb(data_dir, batch_size, num_steps=500):
    """Build the IMDb training and test iterators plus the vocabulary.

    The vocabulary is built from the training split only (tokens seen fewer
    than 5 times are dropped, ``'<pad>'`` is reserved). Every review of both
    splits is converted to token indices and truncated or padded to
    ``num_steps`` entries.

    Args:
        data_dir: Root directory of the extracted aclImdb dataset.
        batch_size: Number of reviews per mini-batch for both iterators.
        num_steps: Fixed sequence length of every review.

    Returns:
        ``(train_iter, test_iter, vocab)``.
    """
    train_data = read_imdb(data_dir, True)
    test_data = read_imdb(data_dir, False)
    train_tokens = dltools.tokenize(train_data[0], token='word')
    test_tokens = dltools.tokenize(test_data[0], token='word')
    vocab = dltools.Vocab(train_tokens, min_freq=5, reserved_tokens=['<pad>'])
    pad_idx = vocab['<pad>']

    def _to_features(token_lists):
        # One fixed-length row of token indices per review.
        return torch.tensor([
            dltools.truncate_pad(vocab[line], num_steps, pad_idx)
            for line in token_lists])

    # Honour the caller's batch_size instead of a hard-coded 64.
    train_iter = dltools.load_array(
        (_to_features(train_tokens), torch.tensor(train_data[1])),
        batch_size)
    # The test features must be paired with the *test* labels; the test set
    # is only evaluated, so it is not shuffled.
    test_iter = dltools.load_array(
        (_to_features(test_tokens), torch.tensor(test_data[1])),
        batch_size, is_train=False)
    return train_iter, test_iter, vocab
# Root of the extracted aclImdb dataset (machine-specific absolute path).
data_dir = r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\aclImdb'
# Build both iterators and the vocabulary in a single call.
train_iter, test_iter, vocab = load_data_imdb( data_dir, 64 , num_steps= 500 )
# Notebook-style bare expression: number of training mini-batches.
len ( train_iter)
391
# Notebook-style bare expression: number of test mini-batches.
len ( test_iter)
391
# Inspect one test mini-batch: its shapes, the padded token indices and the
# labels; stop after the first batch.
for X, y in test_iter:
    print('X:', X.shape, ', y:', y.shape)
    print(X)
    print(y)
    break
# NOTE(review): this reports the size of train_iter although the loop above
# inspects test_iter -- confirm which iterator was intended.
print('小批次数量: ', len(train_iter))
X: torch. Size( [ 64 , 500 ] ) , y: torch. Size( [ 64 ] )
tensor( [ [ 9 , 96 , 122 , . . . , 1 , 1 , 1 ] ,
[ 4787 , 0 , 474 , . . . , 1 , 1 , 1 ] ,
[ 9 , 320 , 42 , . . . , 1 , 1 , 1 ] ,
. . . ,
[ 397 , 219 , 9 , . . . , 1 , 1 , 1 ] ,
[ 3891 , 350 , 455 , . . . , 1 , 1 , 1 ] ,
[ 9 , 208 , 347 , . . . , 1 , 1 , 1 ] ] )
tensor( [ 0 , 0 , 0 , 0 , 1 , 1 , 0 , 0 , 0 , 0 , 1 , 0 , 1 , 1 , 0 , 1 , 0 , 1 , 0 , 0 , 1 , 1 , 1 , 1 ,
1 , 0 , 0 , 0 , 1 , 0 , 1 , 1 , 0 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 0 , 1 , 0 , 1 , 0 , 1 , 1 , 0 ,
0 , 1 , 1 , 0 , 0 , 1 , 1 , 0 , 1 , 1 , 1 , 0 , 1 , 0 , 1 , 0 ] )
小批次数量: 391
8.双向循环神经网络
class BiRNN(nn.Module):
    """Bidirectional LSTM classifier producing two output scores per sequence.

    Args:
        vocab_size: Number of rows of the embedding table.
        embed_size: Width of each embedding vector.
        num_hiddens: Hidden size of the LSTM (per direction).
        num_layers: Number of stacked LSTM layers.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 **kwargs):
        super().__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, num_hiddens,
                               num_layers=num_layers, bidirectional=True)
        # First and last time steps are concatenated, each carrying both
        # directions: 2 steps * 2 directions * num_hiddens input features.
        self.decoder = nn.Linear(4 * num_hiddens, 2)

    def forward(self, inputs):
        """Return the two class scores for every row of ``inputs``.

        ``inputs`` is transposed before the embedding lookup because the
        LSTM is created without ``batch_first`` and so iterates over the
        first axis.
        """
        time_major = inputs.T
        embedded = self.embedding(time_major)
        self.encoder.flatten_parameters()
        hidden_states, _ = self.encoder(embedded)
        first_step, last_step = hidden_states[0], hidden_states[-1]
        return self.decoder(torch.cat((first_step, last_step), dim=1))
9.词向量
class TokenEmbedding:
    """Pretrained token vectors loaded from a space-separated text file.

    Index 0 is reserved for ``'<unk>'`` and maps to an all-zero vector.
    """

    def __init__(self, file_path):
        """Load the vectors stored in ``file_path`` and index their tokens."""
        self.idx_to_token, self.idx_to_vec = self._load_embedding(file_path)
        self.unknown_idx = 0
        self.token_to_idx = {}
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def _load_embedding(self, file_path):
        """Parse the file into a token list and a tensor of vectors.

        Each line is ``token v1 v2 ...``. Lines carrying at most one numeric
        field are skipped.
        """
        idx_to_token, vectors = ['<unk>'], []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                token, *fields = line.rstrip().split(' ')
                values = [float(field) for field in fields]
                if len(values) > 1:
                    idx_to_token.append(token)
                    vectors.append(values)
        # Zero vector for '<unk>', as wide as the first loaded vector.
        unk_vector = [0] * len(vectors[0])
        return idx_to_token, torch.tensor([unk_vector] + vectors)

    def __getitem__(self, tokens):
        """Return the stacked vectors of ``tokens``; unknown ones map to row 0."""
        lookup = self.token_to_idx
        indices = [lookup.get(token, self.unknown_idx) for token in tokens]
        return self.idx_to_vec[torch.tensor(indices)]

    def __len__(self):
        """Number of known tokens, including ``'<unk>'``."""
        return len(self.idx_to_token)
10.网络初始化
# Hyper-parameters: embedding width, LSTM hidden size and number of stacked
# LSTM layers.
embed_size, num_hiddens, num_layers = 100 , 100 , 2
# NOTE(review): try_all_gpus is not shown here; presumably it returns the
# list of usable devices.
devices = dltools. try_all_gpus( )
# One embedding row per vocabulary entry.
net = BiRNN( len ( vocab) , embed_size, num_hiddens, num_layers)
11.参数初始化
def init_weights(m):
    """Xavier-uniform initialise the weights of Linear and LSTM modules.

    Intended for use with ``Module.apply``. Only modules whose exact type is
    ``nn.Linear`` or ``nn.LSTM`` are touched; biases are left unchanged.

    Args:
        m: The module to initialise in place.
    """
    kind = type(m)
    if kind is nn.Linear:
        nn.init.xavier_uniform_(m.weight)
    if kind is nn.LSTM:
        # Parameter names containing 'weight' are the weight matrices; the
        # remaining names are skipped.
        weight_names = [name for name in m._flat_weights_names
                        if 'weight' in name]
        for name in weight_names:
            nn.init.xavier_uniform_(m._parameters[name])
12.网络参数初始化
# Run init_weights on every submodule of net (and on net itself).
net. apply ( init_weights)
BiRNN(
( embedding) : Embedding( 49347 , 100 )
( encoder) : LSTM( 100 , 100 , num_layers= 2 , bidirectional= True )
( decoder) : Linear( in_features= 400 , out_features= 2 , bias= True )
)
13.Glove词向量 用别人的词向量
# Load the pretrained GloVe vectors from a machine-specific absolute path.
glove_embedding = TokenEmbedding( r'D:\AIoT-深度学习视频版\深度学习基础\自然语言处理\NLP入门\data\glove.6B\glove.6B.100d.txt' )
# Notebook-style bare expression: displays the vocabulary tokens in index
# order.
vocab. idx_to_token
[ '<unk>' ,
'<pad>' ,
'the' ,
'a' ,
'imagine' ,
'total' ,
. . . ]
# Look up one pretrained vector per vocabulary token, in vocabulary index
# order; tokens missing from the embedding file get the all-zero '<unk>' row.
embeds = glove_embedding[ vocab. idx_to_token]
# Notebook-style bare expression: displays the shape of the stacked vectors.
embeds. shape
torch. Size( [ 49347 , 100 ] )
14.网络应用 Glove词向量
# Overwrite the embedding table in place with the pretrained vectors.
net. embedding. weight. data. copy_( embeds)
# Freeze the embedding table: no gradient is computed for it.
net. embedding. weight. requires_grad = False
15.训练和评估
# Learning rate and number of passes over the training data.
lr, num_epochs = 0.01 , 100
trainer = torch. optim. Adam( net. parameters( ) , lr= lr)
# reduction='none' keeps one loss value per example.
# NOTE(review): presumably train_batch_ch13 reduces them itself -- that
# helper is not shown here, confirm.
loss = nn. CrossEntropyLoss( reduction= 'none' )
dltools. train_ch13( net, train_iter, test_iter, loss, trainer, num_epochs, devices)
loss 0.168 , train acc 0.933 , test acc 0.798
996.3 examples/ sec on [ device( type = 'cuda' , index= 0 ) ]
16.预测
def predict_sentiment(net, vocab, sequence):
    """Classify the sentiment of a whitespace-separated text.

    Args:
        net: Model called on a ``(1, num_tokens)`` tensor of token indices;
            its output is arg-maxed over dimension 1.
        vocab: Mapping from a list of tokens to a list of indices.
        sequence: The text to classify.

    Returns:
        ``'positive'`` when the predicted class is 1, else ``'negative'``.
    """
    token_ids = vocab[sequence.split()]
    batch = torch.tensor(token_ids, device=dltools.try_gpu()).reshape(1, -1)
    label = torch.argmax(net(batch), dim=1)
    if label == 1:
        return 'positive'
    return 'negative'
# NOTE(review): 'moive' looks like a typo for 'movie'; if it is absent from
# the vocabulary it is looked up as the unknown token -- confirm intent.
predict_sentiment( net, vocab, 'this moive is great' )
'positive'
# NOTE(review): 'moive' looks like a typo for 'movie'; if it is absent from
# the vocabulary it is looked up as the unknown token -- confirm intent.
predict_sentiment( net, vocab, 'this moive is so bad' )
'negative'
17.Dltools工具
( 1 ) tokenize
def tokenize(lines, token='word'):
    """Split text lines into word or character tokens.

    Args:
        lines: Iterable of text lines.
        token: ``'word'`` to split on whitespace, ``'char'`` to split into
            single characters.

    Returns:
        One token list per line, or ``None`` (after printing an error
        message) when ``token`` is neither ``'word'`` nor ``'char'``.
    """
    if token not in ('word', 'char'):
        print('ERROR: unknown token type: ' + token)
        return None
    if token == 'word':
        return [line.split() for line in lines]
    return [list(line) for line in lines]
( 2 ) Vocab
class Vocab:
    """Vocabulary for text."""

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the token/index mappings.

        Index 0 is ``'<unk>'``, followed by ``reserved_tokens``, followed by
        the corpus tokens with frequency of at least ``min_freq`` in
        descending frequency order.
        """
        corpus = [] if tokens is None else tokens
        reserved = [] if reserved_tokens is None else reserved_tokens
        counter = count_corpus(corpus)
        self.token_freqs = sorted(counter.items(), key=lambda pair: pair[1],
                                  reverse=True)
        self.unk = 0
        special = ['<unk>'] + reserved
        frequent = [token for token, freq in self.token_freqs
                    if freq >= min_freq and token not in special]
        self.idx_to_token = special + frequent
        self.token_to_idx = {}
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

    def __len__(self):
        """Number of distinct indices, including ``'<unk>'``."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the unknown index.
        """
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(token) for token in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[index] for index in indices]
        return self.idx_to_token[indices]
( 3 ) truncate_pad
def truncate_pad(line, num_steps, padding_token):
    """Truncate or pad a sequence to exactly ``num_steps`` items.

    Args:
        line: List of items.
        num_steps: Target length.
        padding_token: Item appended to sequences that are too short.

    Returns:
        The first ``num_steps`` items of ``line`` when it is too long,
        otherwise ``line`` followed by as many ``padding_token`` items as
        needed.
    """
    shortfall = num_steps - len(line)
    if shortfall < 0:
        return line[:num_steps]
    return line + [padding_token] * shortfall
( 4 ) load_array
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data iterator.

    Args:
        data_arrays: Tuple of tensors sharing their first dimension.
        batch_size: Number of examples per mini-batch.
        is_train: Shuffle the examples when true.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of ``data_arrays``.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size, shuffle=is_train)
    return loader
( 5 ) train_ch13
def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs,
               devices=dltools.try_all_gpus()):
    """Train a model with multiple GPUs (defined in Chapter 13).

    Args:
        net: Model to train; it is wrapped in ``nn.DataParallel`` and moved
            to ``devices[0]``.
        train_iter: Iterable of ``(features, labels)`` training batches.
        test_iter: Iterable evaluated after every epoch.
        loss: Loss function forwarded to ``train_batch_ch13``.
        trainer: Optimizer forwarded to ``train_batch_ch13``.
        num_epochs: Number of passes over ``train_iter``.
        devices: Devices used for data-parallel training.

    Prints the final loss, training accuracy, test accuracy and the measured
    throughput; returns nothing.
    """
    timer, num_batches = dltools.Timer(), len(train_iter)
    animator = dltools.Animator(xlabel='epoch', xlim=[1, num_epochs],
                                ylim=[0, 1],
                                legend=['train loss', 'train acc', 'test acc'])
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    # Plot roughly five points per epoch. Clamp to 1 so that fewer than five
    # batches does not cause a modulo-by-zero below.
    plot_every = max(1, num_batches // 5)
    for epoch in range(num_epochs):
        # Accumulates: loss value, accuracy value, number of examples and
        # number of label elements.
        # NOTE(review): the first two come from train_batch_ch13, which is
        # not shown here; presumably they are per-batch sums.
        metric = dltools.Accumulator(4)
        for i, (features, labels) in enumerate(train_iter):
            timer.start()
            l, acc = train_batch_ch13(net, features, labels, loss, trainer,
                                      devices)
            metric.add(l, acc, labels.shape[0], labels.numel())
            timer.stop()
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(
                    epoch + (i + 1) / num_batches,
                    (metric[0] / metric[2], metric[1] / metric[3], None))
        test_acc = dltools.evaluate_accuracy_gpu(net, test_iter)
        animator.add(epoch + 1, (None, None, test_acc))
    print(f'loss {metric[0] / metric[2]:.3f}, train acc '
          f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}')
    # The example count of the last epoch times num_epochs approximates the
    # total number of examples processed during the timed sections.
    print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on '
          f'{str(devices)}')
( 6 ) try_gpu
def try_gpu(i=0):
    """Return gpu(i) if exists, otherwise return cpu().

    Args:
        i: Zero-based index of the requested CUDA device.

    Returns:
        ``torch.device('cuda:<i>')`` when at least ``i + 1`` CUDA devices
        are available, otherwise ``torch.device('cpu')``.
    """
    if torch.cuda.device_count() >= i + 1:
        # The device string must not contain spaces ('cuda:0').
        return torch.device(f'cuda:{i}')
    return torch.device('cpu')