# -*- coding: utf-8 -*-
"""
LSTM网络 - IMDB电影评论情感分析
使用TensorFlow自带的IMDB数据集
任务:判断电影评论是正面(positive)还是负面(negative)
"""
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
# ========== Step 1: Load the IMDB dataset ==========
print("=" * 60)
print("Loading the IMDB movie review dataset...")
print("=" * 60)
# The IMDB dataset contains 25,000 training reviews and 25,000 test reviews.
# num_words=10000: keep only the 10,000 most frequent words
# Each review has already been converted to a sequence of word indices.
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=10000)
print(f"\n训练集大小: {len(x_train)} 条评论")
print(f"测试集大小: {len(x_test)} 条评论")
print(f"标签分布 - 正面评论(1): {sum(y_train)} 条, 负面评论(0): {len(y_train) - sum(y_train)} 条")
# 查看第一条评论的内容
print(f"\n第一条评论的单词索引序列长度: {len(x_train[0])}")
print(f"第一条评论的标签: {y_train[0]} ({'正面' if y_train[0] == 1 else '负面'})")
print(f"第一条评论的前50个单词索引: {x_train[0][:50]}")
# ========== Step 2: Preprocess the data ==========
print("\n" + "=" * 60)
print("Preprocessing the data...")
print("=" * 60)
# Problem: reviews have different lengths, but the model expects a fixed length.
# Solution: pad/truncate every sequence to the same length.
maxlen = 200  # maximum sequence length
print(f"\nPadding/truncating every review to {maxlen} words")
# pad_sequences (defaults: padding='pre', truncating='pre'):
# - if a review is shorter than maxlen, it is padded with 0s at the front
# - if a review is longer than maxlen, it is truncated from the front
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
print(f"预处理后训练数据形状: {x_train.shape}") # (25000, 200)
print(f"预处理后测试数据形状: {x_test.shape}") # (25000, 200)
# ========== Step 3: Build the LSTM model ==========
print("\n" + "=" * 60)
print("Building the LSTM model...")
print("=" * 60)
model = keras.Sequential([
    # Layer 1: word embedding (Embedding)
    # Maps each word index to a dense 128-dimensional vector.
    # Input:  (batch_size, sequence_length) = (None, 200)
    # Output: (batch_size, sequence_length, embedding_dim) = (None, 200, 128)
    # Arguments:
    # - input_dim=10000: vocabulary size
    # - output_dim=128: embedding dimension
    layers.Embedding(input_dim=10000, output_dim=128),
    # Layer 2: LSTM
    # Processes the sequence and captures contextual dependencies.
    # 64: number of LSTM units (hidden state dimension)
    # return_sequences=False (default): return only the last time step's output
    # dropout=0.2: drop 20% of the input connections to reduce overfitting
    # recurrent_dropout=0.2: drop 20% of the recurrent connections
    # (note: a non-zero recurrent_dropout disables the fast cuDNN kernel,
    #  so GPU training will be noticeably slower)
    layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    # Layer 3: fully connected layer, 64 units, ReLU activation
    layers.Dense(64, activation='relu'),
    # Layer 4: Dropout, randomly drops 50% of the units to further reduce overfitting
    layers.Dropout(0.5),
    # Layer 5: output layer, 1 unit, sigmoid activation
    # Output in [0, 1]: the predicted probability that the review is positive
    layers.Dense(1, activation='sigmoid')
])
# Show the model architecture
print("\nModel architecture:")
model.summary()
# Count the parameters
total_params = model.count_params()
print(f"\nTotal number of parameters: {total_params:,}")
# ========== Step 4: Compile the model ==========
print("\n" + "=" * 60)
print("Compiling the model...")
print("=" * 60)
model.compile(
    optimizer='adam',            # Adam optimizer
    loss='binary_crossentropy',  # binary cross-entropy loss
    metrics=['accuracy']         # evaluation metric: accuracy
)
print("Optimizer: Adam")
print("Loss function: binary_crossentropy")
print("Metric: accuracy")
# ========== Step 5: Train the model ==========
print("\n" + "=" * 60)
print("Training the model...")
print("=" * 60)
# Training hyperparameters
epochs = 10       # number of training epochs
batch_size = 128  # batch size
history = model.fit(
    x_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,  # hold out the last 20% of the training data for validation
    verbose=1
)
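# Optional sketch (not wired into the fit() call above): with a fixed 10
# epochs this kind of model often starts overfitting after a few epochs; a
# common alternative is an EarlyStopping callback, e.g.:
#   early_stop = keras.callbacks.EarlyStopping(
#       monitor='val_loss', patience=2, restore_best_weights=True)
#   model.fit(..., callbacks=[early_stop])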
# ========== Step 6: Evaluate the model ==========
print("\n" + "=" * 60)
print("Evaluating on the test set...")
print("=" * 60)
test_loss, test_accuracy = model.evaluate(x_test, y_test, verbose=0)
print(f"\nTest loss: {test_loss:.4f}")
print(f"Test accuracy: {test_accuracy * 100:.2f}%")
# ========== Step 7: Plot the training history ==========
print("\nPlotting the training curves...")
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
# Accuracy curves
ax1.plot(history.history['accuracy'], label='Training accuracy', linewidth=2)
ax1.plot(history.history['val_accuracy'], label='Validation accuracy', linewidth=2)
ax1.set_title('Model accuracy', fontsize=14, fontweight='bold')
ax1.set_xlabel('Epoch', fontsize=12)
ax1.set_ylabel('Accuracy', fontsize=12)
ax1.legend(fontsize=10)
ax1.grid(True, alpha=0.3)
# Loss curves
ax2.plot(history.history['loss'], label='Training loss', linewidth=2)
ax2.plot(history.history['val_loss'], label='Validation loss', linewidth=2)
ax2.set_title('Model loss', fontsize=14, fontweight='bold')
ax2.set_xlabel('Epoch', fontsize=12)
ax2.set_ylabel('Loss', fontsize=12)
ax2.legend(fontsize=10)
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('lstm_training_history.png', dpi=150, bbox_inches='tight')
print("Training curves saved to lstm_training_history.png")
# ========== Step 8: Example predictions ==========
print("\n" + "=" * 60)
print("Example predictions...")
print("=" * 60)
# Get the vocabulary mapping (word -> index)
word_index = keras.datasets.imdb.get_word_index()
# Build the reverse mapping (index -> word)
reverse_word_index = {value: key for key, value in word_index.items()}
def decode_review(encoded_review):
    """Decode a sequence of word indices back to text."""
    # Note: indices are offset by 3, because 0, 1 and 2 are reserved:
    # 0: padding, 1: start of sequence, 2: unknown word
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review if i > 0])
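# Quick usage check: decode the (padded) first training review back to text.
print(f"\nDecoded first training review (first 200 chars):\n{decode_review(x_train[0])[:200]}")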
# Predict the first 5 test reviews
print("\nPredicting the first 5 test reviews:")
predictions = model.predict(x_test[:5])
for i in range(5):
    review_text = decode_review(x_test[i])
    predicted_sentiment = predictions[i][0]
    true_label = y_test[i]
    print(f"\n{'='*50}")
    print(f"Review {i+1}:")
    print(f"Text: {review_text[:200]}...")  # show only the first 200 characters
    print(f"True label: {'positive' if true_label == 1 else 'negative'}")
    print(f"Predicted probability: {predicted_sentiment:.4f}")
    print(f"Predicted label: {'positive' if predicted_sentiment > 0.5 else 'negative'}")
    print(f"Prediction {'correct ✓' if (predicted_sentiment > 0.5) == true_label else 'wrong ✗'}")
# ========== Step 9: Predict custom reviews ==========
print("\n" + "=" * 60)
print("Testing custom reviews...")
print("=" * 60)
def predict_sentiment(review_text):
    """
    Predict the sentiment of a custom review.
    Args:
        review_text: the review text (string)
    Returns:
        The predicted sentiment (positive/negative) and its probability.
    """
    # 1. Strip basic punctuation, lowercase, and split into words
    for ch in '.,!?;:"()':
        review_text = review_text.replace(ch, ' ')
    words = review_text.lower().split()
    # 2. Convert words to dataset indices. The dataset offsets raw indices
    #    by 3 (1 = start token, 2 = unknown); words that are missing from
    #    the vocabulary or outside the top 10,000 map to the unknown token 2.
    encoded = [1] + [
        word_index[word] + 3 if word in word_index and word_index[word] + 3 < 10000 else 2
        for word in words
    ]
    # 3. Pad to the fixed length
    encoded = keras.preprocessing.sequence.pad_sequences([encoded], maxlen=maxlen)
    # 4. Predict
    prediction = model.predict(encoded, verbose=0)[0][0]
    sentiment = "positive" if prediction > 0.5 else "negative"
    return sentiment, prediction
# Try a few custom reviews
test_reviews = [
"This movie was absolutely fantastic! I loved every minute of it.",
"Terrible movie, waste of time. I hated it.",
"It was okay, nothing special but not terrible either.",
"Amazing plot and great acting. Highly recommended!",
"Boring and predictable. Don't waste your money."
]
print("\n自定义评论预测结果:")
for i, review in enumerate(test_reviews, 1):
sentiment, probability = predict_sentiment(review)
print(f"\n{i}. 评论: {review}")
print(f" 预测: {sentiment} (概率: {probability:.4f})")
# ========== Step 10: Save the model ==========
print("\n" + "=" * 60)
print("Saving the model...")
print("=" * 60)
model.save('lstm_imdb_sentiment_model.h5')  # legacy HDF5 format; newer Keras also supports the .keras format
print("\nModel saved to lstm_imdb_sentiment_model.h5")
# Save a summary of the model
with open('model_info.txt', 'w', encoding='utf-8') as f:
    f.write("LSTM sentiment analysis model info\n")
    f.write("=" * 50 + "\n")
    f.write("Dataset: IMDB movie reviews\n")
    f.write("Task: binary classification (positive/negative sentiment)\n")
    f.write(f"Training samples: {len(x_train)}\n")
    f.write(f"Test samples: {len(x_test)}\n")
    f.write("Vocabulary size: 10000\n")
    f.write(f"Maximum sequence length: {maxlen}\n")
    f.write(f"Training epochs: {epochs}\n")
    f.write(f"Batch size: {batch_size}\n")
    f.write(f"Test accuracy: {test_accuracy * 100:.2f}%\n")
    f.write(f"Total parameters: {total_params:,}\n")
print("Model info saved to model_info.txt")
print("\n" + "=" * 60)
print("训练完成!")
print("=" * 60)
# ========== Summary ==========
print("\nProject summary:")
print("✓ Dataset: IMDB movie reviews (50,000 in total)")
print("✓ Model: LSTM + Embedding")
print("✓ Task: sentiment analysis (binary classification)")
print(f"✓ Test accuracy: {test_accuracy * 100:.2f}%")
print(f"✓ Total parameters: {total_params:,}")
print("\nPossible improvements:")
print("  1. Add more LSTM layers or units")
print("  2. Use a bidirectional LSTM (see the sketch below)")
print("  3. Tune the embedding dimension and sequence length")
print("  4. Replace the LSTM with a GRU")
print("  5. Train for more epochs")