import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import numpy as np
# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
# Expanded training data: more samples, in particular ones containing negation words
texts = [
    # Positive samples
"我喜欢这部电影", "这部电影很精彩", "那本书很不错", "我很喜欢这本书",
"这个结局太棒了", "故事情节很吸引人", "演员表演很出色", "画面效果很震撼",
"音乐很动听", "整体体验很好", "我非常满意", "推荐大家去看",
"这部电影值得一看", "内容很有深度", "看完让人心情很好", "非常精彩的一部电影",
    # Negative samples (containing negation words)
"我不喜欢这部电影", "这部电影很糟糕", "那本书不好看", "我不喜欢这个结局",
"情节很无聊", "我讨厌这个结局", "表演很差", "画面效果不好",
"音乐很难听", "整体体验很差", "我很不满意", "不推荐大家去看",
"这部电影不值得一看", "内容很肤浅", "看完让人心情不好", "非常糟糕的一部电影"
] * 10  # Repeat 10 times to enlarge the dataset
# Build a labels list whose length matches texts:
# the first 16 samples are positive and the last 16 are negative, each block repeated 10 times
positive_labels = [1] * 16
negative_labels = [0] * 16
labels = (positive_labels + negative_labels) * 10  # 320 labels in total, aligned with texts
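# Sanity check (illustrative addition): the dataset construction above assumes texts
# and labels stay aligned one-to-one; fail fast if they ever drift apart.
assert len(texts) == len(labels), f"texts ({len(texts)}) and labels ({len(labels)}) must have the same length"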
class TextDataset(Dataset):
def __init__(self, texts, labels, tokenizer, max_len):
self.sequences = tokenizer.texts_to_sequences(texts)
        # Make sure no sequence is empty (checked again here, in addition to the tokenizer)
for i, seq in enumerate(self.sequences):
if not seq and tokenizer.word_index:
                # Use the index of the first word in the vocabulary as a placeholder
self.sequences[i] = [next(iter(tokenizer.word_index.values()))]
        # Convert to PyTorch LongTensors and pad to the longest sequence in this dataset
self.padded_sequences = pad_sequence(
[torch.LongTensor(seq) for seq in self.sequences],
batch_first=True,
padding_value=0
)
self.labels = torch.FloatTensor(labels).view(-1, 1)
        self.max_len = max_len  # kept for reference; padding above is to the longest sequence
def __len__(self):
return len(self.labels)
def __getitem__(self, idx):
        # Return the padded sequence and its label
return self.padded_sequences[idx], self.labels[idx]
# Tokenizer (mirrors the TensorFlow Tokenizer interface used for preprocessing)
class Tokenizer:
    """Minimal re-implementation of the Keras Tokenizer interface; unsegmented
    Chinese text (no whitespace) is tokenized per character."""
    def __init__(self, num_words=None):
        self.word_index = {}
        self.index_word = {}
        self.num_words = num_words
        self.fit_called = False
    @staticmethod
    def _tokenize(text):
        # Split on whitespace when present; otherwise fall back to character-level
        # tokens, since the training texts here are unsegmented Chinese
        return text.split() if " " in text else list(text)
def fit_on_texts(self, texts):
word_counts = {}
for text in texts:
            for word in self._tokenize(text):
word_counts[word] = word_counts.get(word, 0) + 1
sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
if self.num_words:
sorted_words = sorted_words[:self.num_words]
self.word_index = {word: idx + 1 for idx, (word, count) in enumerate(sorted_words)}
self.index_word = {idx: word for word, idx in self.word_index.items()}
self.fit_called = True
def texts_to_sequences(self, texts):
if not self.fit_called:
            raise RuntimeError("Tokenizer has not been fitted; call fit_on_texts first")
sequences = []
for text in texts:
seq = []
            for word in self._tokenize(text):
if word in self.word_index:
seq.append(self.word_index[word])
            # Make sure the sequence is not empty: fall back to the first
            # vocabulary index (1) as a placeholder for fully out-of-vocabulary text
            if not seq and self.word_index:
seq = [next(iter(self.word_index.values()))]
sequences.append(seq)
return sequences
# Initialise the tokenizer
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
# Convert all texts to index sequences
sequences = tokenizer.texts_to_sequences(texts)
# Compute the maximum sequence length
max_sequence_length = max(len(seq) for seq in sequences if seq)  # consider only non-empty sequences
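# Quick inspection (illustrative addition): report the fitted vocabulary size and
# show how the first training sentence is encoded as vocabulary indices.
print(f"Vocabulary size: {len(tokenizer.word_index)}, max sequence length: {max_sequence_length}")
print(f"Example encoding for '{texts[0]}': {sequences[0]}")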
# Split into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
texts, labels, test_size=0.2, random_state=42
)
# Create datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer, max_sequence_length)
test_dataset = TextDataset(test_texts, test_labels, tokenizer, max_sequence_length)
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
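# Quick shape check (illustrative addition): one training batch should come out as
# [batch_size, max_seq_len] for the text and [batch_size, 1] for the labels.
sample_text, sample_labels = next(iter(train_loader))
print(f"Batch text shape: {tuple(sample_text.shape)}, label shape: {tuple(sample_labels.shape)}")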
class BiLSTMModel(nn.Module):
    """BiLSTM sentiment classifier implemented in PyTorch"""
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
super().__init__()
        # Embedding layer (padding_idx=0 keeps the padding vector at zero and out of gradient updates)
self.embedding = nn.Embedding(
num_embeddings=vocab_size,
embedding_dim=embedding_dim,
padding_idx=pad_idx
)
        # Bidirectional LSTM layer
self.lstm = nn.LSTM(
input_size=embedding_dim,
hidden_size=hidden_dim,
num_layers=1,
bidirectional=True,
batch_first=True
)
        # Fully connected classification head
self.fc = nn.Sequential(
nn.Dropout(0.5),
            nn.Linear(hidden_dim * 2, 32),  # the bidirectional LSTM output is hidden_dim * 2
nn.ReLU(),
nn.Linear(32, output_dim),
nn.Sigmoid()
)
def forward(self, text):
        # text: [batch_size, seq_len]
        # Guard against zero-length sequences
batch_size, seq_len = text.shape
if seq_len == 0:
            # If the sequence length is 0, substitute a dummy input of length 1
text = torch.ones(batch_size, 1, dtype=torch.long, device=text.device)
embedded = self.embedding(text) # [batch_size, seq_len, emb_dim]
        # Run the LSTM
lstm_output, (hidden, cell) = self.lstm(embedded)
        # Concatenate the final hidden states of the forward and backward directions
hidden = torch.cat((hidden[-2], hidden[-1]), dim=1) # [batch_size, hid_dim * 2]
        # Classification head
return self.fc(hidden)
# Model hyperparameters
VOCAB_SIZE = len(tokenizer.word_index) + 1  # +1 for padding index 0
EMBEDDING_DIM = 128
HIDDEN_DIM = 64
OUTPUT_DIM = 1
PAD_IDX = 0  # padding index
# Instantiate the model
model = BiLSTMModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)
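# Sanity check (illustrative addition): a dummy batch of two length-5 index sequences
# should produce one sigmoid probability per example, i.e. shape [2, OUTPUT_DIM].
with torch.no_grad():
    _dummy = torch.randint(1, VOCAB_SIZE, (2, 5))
    assert model(_dummy).shape == (2, OUTPUT_DIM)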
# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()  # binary cross-entropy
# Accuracy helper
def binary_accuracy(preds, y):
    """Compute the fraction of correct predictions after rounding the sigmoid outputs"""
rounded_preds = torch.round(preds)
correct = (rounded_preds == y).float()
acc = correct.sum() / len(correct)
return acc
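# Worked example (illustrative addition): predictions [0.9, 0.2, 0.6] round to [1, 0, 1];
# against labels [1, 0, 0] two out of three are correct, so the accuracy is 2/3.
assert abs(binary_accuracy(torch.tensor([0.9, 0.2, 0.6]), torch.tensor([1.0, 0.0, 0.0])).item() - 2 / 3) < 1e-6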
# Training for one epoch
def train(model, iterator, optimizer, criterion):
epoch_loss = 0
epoch_acc = 0
model.train()
for batch in iterator:
text, labels = batch
optimizer.zero_grad()
predictions = model(text).squeeze(1)
        # Squeeze the labels as well so predictions and targets have matching shapes
labels = labels.squeeze(1)
loss = criterion(predictions, labels)
acc = binary_accuracy(predictions, labels)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()
return epoch_loss / len(iterator), epoch_acc / len(iterator)
# Evaluation (loss, accuracy, precision, recall)
def evaluate(model, iterator, criterion):
epoch_loss = 0
epoch_acc = 0
all_preds = []
all_labels = []
model.eval()
with torch.no_grad():
for batch in iterator:
text, labels = batch
predictions = model(text).squeeze(1)
            # Squeeze the labels as well so predictions and targets have matching shapes
labels = labels.squeeze(1)
loss = criterion(predictions, labels)
acc = binary_accuracy(predictions, labels)
epoch_loss += loss.item()
epoch_acc += acc.item()
all_preds.append(torch.round(predictions))
all_labels.append(labels)
    # Concatenate predictions and labels from all batches
all_preds = torch.cat(all_preds).cpu().numpy()
all_labels = torch.cat(all_labels).cpu().numpy()
    # Precision and recall (zero_division=0 avoids errors when a class is never predicted)
    precision = precision_score(all_labels, all_preds, zero_division=0)
    recall = recall_score(all_labels, all_preds, zero_division=0)
return epoch_loss / len(iterator), epoch_acc / len(iterator), precision, recall
# Early stopping
def early_stopping(val_loss, patience, best_loss, counter):
if val_loss < best_loss:
best_loss = val_loss
counter = 0
else:
counter += 1
stop = counter >= patience
return stop, counter, best_loss
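# Worked example (illustrative addition): with patience=2 the losses 0.50, 0.48, 0.49, 0.51
# trigger a stop on the fourth value, because 0.48 is not improved upon for two epochs.
_best, _counter, _stop = float('inf'), 0, False
for _loss in (0.50, 0.48, 0.49, 0.51):
    _stop, _counter, _best = early_stopping(_loss, patience=2, best_loss=_best, counter=_counter)
assert _stop and _best == 0.48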
# Train the model
N_EPOCHS = 50
patience = 5
best_val_loss = float('inf')
counter = 0
for epoch in range(N_EPOCHS):
train_loss, train_acc = train(model, train_loader, optimizer, criterion)
val_loss, val_acc, val_precision, val_recall = evaluate(model, test_loader, criterion)
    # Check the early-stopping condition
stop, counter, best_val_loss = early_stopping(val_loss, patience, best_val_loss, counter)
print(f'Epoch: {epoch+1:02}')
print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
print(f'\t Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc*100:.2f}%')
print(f'\t Val. Precision: {val_precision:.4f} | Val. Recall: {val_recall:.4f}')
if stop:
        print(f'Early stopping triggered: validation loss did not improve for {patience} epochs')
break
# Final evaluation
test_loss, test_acc, test_precision, test_recall = evaluate(model, test_loader, criterion)
print('\nFinal test performance:')
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
print(f'Test Precision: {test_precision:.4f} | Test Recall: {test_recall:.4f}')
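# Optional (illustrative addition): persist the trained weights for later reuse;
# the file name below is arbitrary and not part of the original pipeline.
# torch.save(model.state_dict(), 'bilstm_sentiment.pt')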
# Predict the sentiment of new sentences
def predict_sentiment(model, tokenizer, sentence, max_len):
model.eval()
sequence = tokenizer.texts_to_sequences([sentence])[0]
    # Make sure the sequence is not empty
if not sequence and tokenizer.word_index:
sequence = [next(iter(tokenizer.word_index.values()))]
    tensor = torch.LongTensor(sequence).unsqueeze(0)
    with torch.no_grad():
        prediction = model(tensor).item()
    # Note: there is no special handling of negation words such as "不" here; if the
    # model systematically misses negated sentences, a rule-based adjustment of the
    # probability could be added at this point.
return prediction
# Try the model on a few new sentences
test_sentences = ["这部电影太精彩了", "我不喜欢这个结尾"]
for sentence in test_sentences:
prob = predict_sentiment(model, tokenizer, sentence, max_sequence_length)
    sentiment = "positive" if prob > 0.5 else "negative"
    confidence = abs(prob - 0.5) * 200  # confidence as a percentage
    print(f"Text: '{sentence}' -> prediction: {sentiment} (confidence: {confidence:.1f}%)")