<think>We have an LSTM-based Tang-poetry generation model to implement against the given code skeleton and dataset path.
Dataset path: `"AI数据集/神经网络与深度学习/poetry_data/poetry_data/poetry.txt"`
Per the assignment, the steps are:
1. Prepare the data: read the dataset and preprocess it (character-level tokenization, word-vector training, etc.)
2. Define the LSTM-based model
3. Train the model
4. Test the model (generate acrostic poems)
Note, however, that the provided snippet reads `poetry_7.txt` in its data-preparation step, while we need `poetry.txt`, so the path must be adjusted.
Also, some parts of the original code were not given (e.g. the definition of the `Poetry_Dataset` class), so we infer them from context and fill them in.
Steps:
1. Change the data path from `poetry_7.txt` to `poetry.txt`.
2. Since the dataset may be large, cap the number of training samples (`train_num`) to keep training time manageable.
As the code is long, I will present it in steps and explain the key parts.
First, import the required libraries:
</think>Based on your requirements, here is a complete implementation of the LSTM-based Tang-poetry writing system:
```python
import os
import time
import pickle
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from gensim.models import Word2Vec
# Configuration
config = {
    "embedding_num": 128,           # word-vector dimension
    "hidden_num": 256,              # LSTM hidden-state dimension
    "batch_size": 64,               # batch size
    "epochs": 50,                   # number of training epochs
    "lr": 0.001,                    # learning rate
    "optimizer": torch.optim.Adam,  # optimizer class
    "train_num": 6000,              # number of training samples
    "log_path": "./logs"            # TensorBoard log directory
}
# Data preprocessing
def process_data(file_path="poetry.txt", train_num=6000):
    """Load and preprocess the poetry data."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file {file_path} does not exist")
    # Read the poems, one per line
    with open(file_path, "r", encoding="utf-8") as f:
        all_poems = f.read().split("\n")[:train_num]
    # Write a tokenized copy of the corpus
    split_file = "split.txt"
    if not os.path.exists(split_file):
        with open(split_file, "w", encoding="utf-8") as f:
            for poem in all_poems:
                # Tokenize each poem into single characters
                split_poem = " ".join(poem)
                f.write(split_poem + "\n")
    return all_poems
# Word-vector training
def train_word_vectors(vector_size=128, split_file="split.txt", train_num=6000):
    """Train (or load cached) character-level word vectors."""
    param_file = "word_vec.pkl"
    # Reuse previously trained vectors if available
    if os.path.exists(param_file):
        with open(param_file, "rb") as f:
            return pickle.load(f)
    # Load the tokenized data
    with open(split_file, "r", encoding="utf-8") as f:
        split_data = [line.strip().split() for line in f.readlines()[:train_num]]
    # Train the Word2Vec model
    model = Word2Vec(
        sentences=split_data,
        vector_size=vector_size,
        window=5,
        min_count=1,
        workers=4,
        epochs=10
    )
    # Cache the vectors and the index mappings
    word_vectors = model.wv.vectors
    word_to_index = model.wv.key_to_index
    index_to_word = model.wv.index_to_key
    with open(param_file, "wb") as f:
        pickle.dump((word_vectors, word_to_index, index_to_word), f)
    return word_vectors, word_to_index, index_to_word
# Dataset class
class PoetryDataset(Dataset):
    def __init__(self, word_vectors, word_to_index, poems):
        self.word_vectors = word_vectors
        self.word_to_index = word_to_index
        self.poems = poems
        self.seq_length = 32  # fixed sequence length

    def __len__(self):
        return len(self.poems)

    def __getitem__(self, idx):
        poem = self.poems[idx].strip()
        # Pad short poems so every sample yields seq_length input/target pairs;
        # padding characters fall back to index 0 via .get() below
        if len(poem) < self.seq_length + 1:
            poem += " " * (self.seq_length + 1 - len(poem))
        # Build the input (vectors) and target (next-character indices) sequences
        input_seq = []
        target_seq = []
        for i in range(len(poem) - 1):
            char = poem[i]
            next_char = poem[i + 1]
            # Look up character indices (unknown characters map to 0)
            char_idx = self.word_to_index.get(char, 0)
            next_char_idx = self.word_to_index.get(next_char, 0)
            # Look up the character's word vector
            char_vec = self.word_vectors[char_idx]
            input_seq.append(char_vec)
            target_seq.append(next_char_idx)
            if len(input_seq) >= self.seq_length:
                break
        # Convert to tensors
        input_tensor = torch.tensor(np.array(input_seq), dtype=torch.float32)
        target_tensor = torch.tensor(np.array(target_seq), dtype=torch.long)
        return input_tensor, target_tensor
# LSTM poetry model
class PoetryModelLSTM(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Load the pretrained word vectors and vocabulary mappings
        self.word_vectors, self.word_to_index, self.index_to_word = train_word_vectors(
            vector_size=config["embedding_num"],
            train_num=config["train_num"]
        )
        self.vocab_size = len(self.word_to_index)
        self.embedding_dim = config["embedding_num"]
        self.hidden_dim = config["hidden_num"]
        self.batch_size = config["batch_size"]
        self.num_layers = 2  # two stacked LSTM layers
        # Device selection
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Model architecture
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            batch_first=True,
            dropout=0.3
        )
        self.fc = nn.Linear(self.hidden_dim, self.vocab_size)
        self.softmax = nn.LogSoftmax(dim=2)
    def forward(self, x, hidden=None):
        # Initialize the hidden state if none is passed in
        if hidden is None:
            h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(self.device)
            c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(self.device)
        else:
            h0, c0 = hidden
        # LSTM forward pass
        out, hidden = self.lstm(x, (h0, c0))
        # Project to vocabulary size and apply log-softmax
        out = self.fc(out)
        out = self.softmax(out)
        return out, hidden
    def train_model(self):
        """Train the model."""
        # Build the dataset and data loader
        dataset = PoetryDataset(
            self.word_vectors,
            self.word_to_index,
            process_data(train_num=config["train_num"])
        )
        dataloader = DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=True
        )
        # Optimizer and loss function
        optimizer = config["optimizer"](self.parameters(), lr=config["lr"])
        criterion = nn.NLLLoss()
        self.to(self.device)
        losses = []
        # Create the log directory
        os.makedirs(config["log_path"], exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join(
            config["log_path"],
            time.strftime('%Y%m%d_%H%M%S')
        ))
        print(f"Starting training, {len(dataloader)} batches per epoch...")
        for epoch in range(config["epochs"]):
            total_loss = 0
            self.train()
            for batch_idx, (inputs, targets) in enumerate(dataloader):
                inputs, targets = inputs.to(self.device), targets.to(self.device)
                # Forward pass
                outputs, _ = self(inputs)
                # NLLLoss expects (batch, vocab, seq), so swap the last two dims
                loss = criterion(
                    outputs.transpose(1, 2),
                    targets
                )
                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()
                # Report every 100 batches
                if batch_idx % 100 == 0:
                    avg_loss = total_loss / (batch_idx + 1)
                    print(f"Epoch [{epoch+1}/{config['epochs']}], "
                          f"Batch [{batch_idx}/{len(dataloader)}], "
                          f"Loss: {avg_loss:.4f}")
                    # Log the loss to TensorBoard
                    writer.add_scalar('Loss/train', avg_loss, epoch * len(dataloader) + batch_idx)
                    # Generate a sample poem, then switch back to training mode
                    # (generation puts the module into eval mode)
                    self.generate_poetry_auto()
                    self.train()
            # Record the average loss of this epoch
            losses.append(total_loss / len(dataloader))
            # Save a model checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': self.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': total_loss,
            }, f"poetry_model_epoch_{epoch}.pth")
        writer.close()
        print("Training finished!")
        return losses
    def generate_poetry_auto(self, start_char=None, max_len=32):
        """Generate a poem automatically."""
        self.eval()
        with torch.no_grad():
            # Pick a random starting character if none was given
            if start_char is None:
                start_char = np.random.choice(list(self.word_to_index.keys()))
            # Convert it to an index
            start_idx = self.word_to_index.get(start_char, 0)
            # Build the initial input of shape (1, 1, embedding_dim)
            input_vec = torch.tensor(
                self.word_vectors[start_idx],
                dtype=torch.float32
            ).unsqueeze(0).unsqueeze(0).to(self.device)
            # Initialize the hidden state
            hidden = (
                torch.zeros(self.num_layers, 1, self.hidden_dim).to(self.device),
                torch.zeros(self.num_layers, 1, self.hidden_dim).to(self.device)
            )
            # Generate the poem character by character
            poetry = start_char
            for i in range(max_len - 1):
                output, hidden = self(input_vec, hidden)
                # Sample the next character from the output distribution
                prob = torch.exp(output.squeeze())
                char_idx = torch.multinomial(prob, 1).item()
                # Look up the character
                char = self.index_to_word[char_idx]
                poetry += char
                # Prepare the next input
                input_vec = torch.tensor(
                    self.word_vectors[char_idx],
                    dtype=torch.float32
                ).unsqueeze(0).unsqueeze(0).to(self.device)
            print(f"Generated poem: {poetry}")
            return poetry
    def generate_acrostic_poetry(self, head_chars):
        """Generate an acrostic poem from the given head characters."""
        self.eval()
        with torch.no_grad():
            result = ""
            punctuation = [",", "。"]  # alternate comma / full stop at line ends
            for i, char in enumerate(head_chars):
                # Each line starts with the given head character
                line = char
                # Convert it to an index
                char_idx = self.word_to_index.get(char, 0)
                # Build the initial input
                input_vec = torch.tensor(
                    self.word_vectors[char_idx],
                    dtype=torch.float32
                ).unsqueeze(0).unsqueeze(0).to(self.device)
                # Reset the hidden state for each line
                hidden = (
                    torch.zeros(self.num_layers, 1, self.hidden_dim).to(self.device),
                    torch.zeros(self.num_layers, 1, self.hidden_dim).to(self.device)
                )
                # Generate 6 more characters for a 7-character line (head included)
                for j in range(6):
                    output, hidden = self(input_vec, hidden)
                    # Sample the next character from the output distribution
                    prob = torch.exp(output.squeeze())
                    char_idx = torch.multinomial(prob, 1).item()
                    # Look up the character
                    next_char = self.index_to_word[char_idx]
                    line += next_char
                    # Prepare the next input
                    input_vec = torch.tensor(
                        self.word_vectors[char_idx],
                        dtype=torch.float32
                    ).unsqueeze(0).unsqueeze(0).to(self.device)
                # Append the line-ending punctuation (safe for any head length)
                result += line + punctuation[i % 2]
            print(f"Acrostic poem [{head_chars}]: {result}")
            return result
# Main program
if __name__ == "__main__":
    # Initialize the model
    model = PoetryModelLSTM(config)
    # Option: train from scratch or load a saved checkpoint
    train_model = True
    if train_model:
        # Train the model
        model.train_model()
    else:
        # Load a saved checkpoint; checkpoints are dicts, so extract the state dict
        checkpoint = torch.load(f"poetry_model_epoch_{config['epochs'] - 1}.pth",
                                map_location=model.device)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.to(model.device)
    # Test the generation functions
    print("\nAutomatic generation example:")
    model.generate_poetry_auto("春")
    print("\nAcrostic poem example:")
    model.generate_acrostic_poetry("春夏秋冬")
```
### Code walkthrough:
1. **Data preprocessing**:
   - `process_data()`: loads the poetry dataset and tokenizes each poem into single characters
   - `train_word_vectors()`: trains a Word2Vec model for character-level word vectors
2. **Dataset class**:
   - `PoetryDataset`: turns the poems into input/target pairs the model can consume
3. **LSTM model**:
   - Two stacked LSTM layers with dropout to reduce overfitting
   - Input: pretrained word vectors (character embeddings)
   - Output: fully connected layer + LogSoftmax
4. **Training process** (see the shape sketch after this list):
   - Negative log-likelihood loss (NLLLoss)
   - Adam optimizer
   - Logs the loss and generates a sample poem every 100 batches
   - Supports TensorBoard visualization of the training run
5. **Poetry generation**:
   - `generate_poetry_auto()`: generates a complete poem by sampling one character at a time
   - `generate_acrostic_poetry()`: generates an acrostic poem (e.g. from four head characters)
6. **Utilities**:
   - Model checkpoint saving
   - Automatic device selection (CPU/GPU)
   - Random poem opening when no start character is given
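Why `outputs.transpose(1, 2)` in the loss: `nn.NLLLoss` expects the class dimension second, i.e. `(batch, num_classes, seq_len)`, while the model emits `(batch, seq_len, vocab)`. A minimal, self-contained shape check with toy sizes (independent of the model above):
```python
import torch
import torch.nn as nn

batch, seq_len, vocab = 4, 32, 1000
# Model output after LogSoftmax(dim=2): (batch, seq_len, vocab)
log_probs = torch.log_softmax(torch.randn(batch, seq_len, vocab), dim=2)
targets = torch.randint(0, vocab, (batch, seq_len))  # (batch, seq_len)

criterion = nn.NLLLoss()
# NLLLoss wants (batch, num_classes, seq_len), hence the transpose
loss = criterion(log_probs.transpose(1, 2), targets)
print(loss)  # scalar loss averaged over batch and sequence positions
```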
### Usage:
1. **Train the model**:
   ```python
   model = PoetryModelLSTM(config)
   model.train_model()
   ```
2. **Generate poems** (a temperature-sampling variant is sketched after this list):
   ```python
   # Automatic generation
   model.generate_poetry_auto("春")
   # Acrostic poem
   model.generate_acrostic_poetry("人工智能")
   ```
3. **Tune the parameters** by editing the `config` dictionary:
   - `embedding_num`: word-vector dimension
   - `hidden_num`: LSTM hidden-state dimension
   - `batch_size`: batch size
   - `epochs`: number of training epochs
   - `lr`: learning rate
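Both generation methods sample directly from the model's output distribution. A common refinement, not part of the code above, is temperature scaling, which trades diversity against coherence; the helper name `sample_with_temperature` is illustrative:
```python
import torch

def sample_with_temperature(log_probs: torch.Tensor, temperature: float = 0.8) -> int:
    """Sample a character index from log-probabilities with temperature scaling.

    temperature < 1.0 sharpens the distribution (safer, more repetitive);
    temperature > 1.0 flattens it (more diverse, more mistakes).
    """
    scaled = log_probs / temperature
    probs = torch.softmax(scaled, dim=-1)
    return torch.multinomial(probs, 1).item()

# Usage inside the generation loops, replacing the plain multinomial call:
#   char_idx = sample_with_temperature(output.squeeze(), temperature=0.8)
```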
### Notes:
1. The first run trains the word vectors automatically (this can take a while)
2. Make sure the dataset path is correct (the default is `poetry.txt`)
3. A model checkpoint is saved automatically after every epoch
4. A GPU speeds up training significantly
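Regarding note 1, it can be worth sanity-checking the cached word vectors before training the LSTM. A minimal sketch using the pickle file written by `train_word_vectors()`; the `most_similar` helper is illustrative, and it assumes `word_vec.pkl` exists and the queried character is in the vocabulary:
```python
import pickle
import numpy as np

# Load the cached vectors and mappings produced by train_word_vectors()
with open("word_vec.pkl", "rb") as f:
    word_vectors, word_to_index, index_to_word = pickle.load(f)

def most_similar(char: str, topn: int = 5):
    """Return the topn characters closest to `char` by cosine similarity."""
    vec = word_vectors[word_to_index[char]]
    norms = np.linalg.norm(word_vectors, axis=1) * np.linalg.norm(vec)
    sims = word_vectors @ vec / norms
    best = np.argsort(-sims)[1:topn + 1]  # skip the query character itself
    return [(index_to_word[i], float(sims[i])) for i in best]

print(most_similar("月"))  # semantically related characters should rank high
```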