[Embedding Collection] Embedding Behavior Sequence Data with an LSTM Model
Using an LSTM model to embed variable-length sequence data
LSTM is a variant of the recurrent neural network (RNN) that can effectively capture and model long-term dependencies in sequence data. An LSTM has a memory cell and gating mechanisms that adaptively store and forget information according to the context of the sequence, so it can better handle long-term dependencies.
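To make the model code later in this section easier to follow, here is a minimal, self-contained sketch (not part of the original notebook; the names toy_lstm and toy_batch and their sizes are made up for illustration) of what nn.LSTM returns: a per-step output plus a final hidden state h_n whose size is fixed regardless of sequence length, which is what lets it serve as a sequence embedding.
import torch
import torch.nn as nn

toy_lstm = nn.LSTM(input_size=8, hidden_size=4, batch_first=True)  # hypothetical sizes
toy_batch = torch.randn(2, 5, 8)          # (batch=2, seq_len=5, feature_dim=8)
output, (h_n, c_n) = toy_lstm(toy_batch)  # output: one hidden state per time step
print(output.shape)  # torch.Size([2, 5, 4])
print(h_n.shape)     # torch.Size([1, 2, 4]) -- fixed size, independent of seq_len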
The resulting embeddings can be used to measure the similarity between user behavior sequences and to make more accurate predictions of behaviors such as payment and churn (a cosine-similarity sketch is given at the end of this section).
import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
data = {'user_id': [1, 2, 3, 4],
        'sequences': [
            ['click', 'play', 'download'],
            ['upload', 'click'],
            ['play'],
            ['add', 'upload', 'click', 'play']
        ],
        'labels': [1, 0, 0, 2]}
df = pd.DataFrame(data)
df.head()
   user_id                   sequences  labels
0        1     [click, play, download]       1
1        2             [upload, click]       0
2        3                      [play]       0
3        4  [add, upload, click, play]       2
# Build a vocabulary mapping each behavior token to an integer id (0 is reserved for padding)
word2idx = {}
idx = 1
for seq in df["sequences"]:
    for word in seq:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
word2idx
{'click': 1, 'play': 2, 'download': 3, 'upload': 4, 'add': 5}
df[ "sequences" ] = [ [ word2idx[ word] for word in seq] for seq in df[ "sequences" ] ]
df. head( )
   user_id     sequences  labels
0        1     [1, 2, 3]       1
1        2        [4, 1]       0
2        3           [2]       0
3        4  [5, 4, 1, 2]       2
# Pad the variable-length sequences to the length of the longest one, using 0 as the padding id
padded_sequences = pad_sequence([torch.tensor(seq) for seq in df["sequences"]], batch_first=True, padding_value=0)
# Expand each scalar label into a 10-dimensional target so it matches the model's 10-dimensional output for MSE training
expanded_labels = torch.zeros(len(df), 10, dtype=torch.float32)
for i, label in enumerate(df['labels']):
    expanded_labels[i, :] = torch.tensor([label] * 10)
class LSTMEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # token id -> dense vector
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 10)  # project the final hidden state to a 10-dim embedding

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(x)
        x = self.fc(h_n.squeeze(0))  # h_n is the final hidden state summarizing the whole sequence
        return x
dictionary_size = len(word2idx) + 1  # +1 for the padding id 0
embedding_dim = 128
hidden_dim = 50
model = LSTMEmbedding(dictionary_size, embedding_dim, hidden_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
Y_train = expanded_labels
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    outputs = model(padded_sequences)
    loss = criterion(outputs, Y_train)
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
Epoch [10/100], Loss: 0.7141
Epoch [20/100], Loss: 0.2935
Epoch [30/100], Loss: 0.0737
Epoch [40/100], Loss: 0.0163
Epoch [50/100], Loss: 0.0036
Epoch [60/100], Loss: 0.0022
Epoch [70/100], Loss: 0.0007
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0002
Epoch [100/100], Loss: 0.0001
# Use the trained model's 10-dimensional outputs as the sequence embeddings
model.eval()
with torch.no_grad():
    embeddings = model(padded_sequences)
tensor_list = embeddings.tolist()
df['Embedding'] = tensor_list
df.head()
   user_id     sequences  labels                                          Embedding
0        1     [1, 2, 3]       1  [0.9973136186599731, 0.9970695376396179, 1.003...
1        2        [4, 1]       0  [0.007720008492469788, 0.01210467517375946, -0...
2        3           [2]       0  [-0.0053714364767074585, -0.001586258411407470...
3        4  [5, 4, 1, 2]       2  [1.993236780166626, 2.0014617443084717, 1.9920...
embeddings
tensor([[ 9.9731e-01, 9.9707e-01, 1.0039e+00, 9.9692e-01, 9.9279e-01,
9.9808e-01, 9.9333e-01, 1.0054e+00, 1.0042e+00, 9.9489e-01],
[ 7.7200e-03, 1.2105e-02, -6.0443e-03, 2.2997e-02, 2.2623e-02,
-1.7220e-02, -6.5562e-03, -5.7095e-03, -1.0300e-03, 1.3729e-02],
[-5.3714e-03, -1.5863e-03, 1.4794e-02, -2.6556e-02, -2.1868e-02,
8.8659e-03, -9.2685e-06, 9.5384e-03, 1.4281e-04, -1.4242e-02],
[ 1.9932e+00, 2.0015e+00, 1.9920e+00, 2.0031e+00, 1.9985e+00,
1.9993e+00, 2.0001e+00, 1.9908e+00, 1.9968e+00, 1.9987e+00]])
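As mentioned at the top of the section, the embeddings can be used to compare user behavior sequences. Below is a minimal sketch, assuming the embeddings tensor from the cells above is still in scope, that computes a pairwise cosine-similarity matrix between the four users; higher values indicate more similar embedded sequences.
import torch.nn.functional as F

normed = F.normalize(embeddings, dim=1)  # scale each embedding to unit length
similarity = normed @ normed.T           # (4, 4) cosine-similarity matrix
print(similarity)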