Utterly Frustrated

Sigh. Our class got switched to a hopelessly amateurish teacher. Honestly... all he ever writes is code fragments, every handout is like that, and he brags nonstop. We're done for. Here is the code he gave us:
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import jieba
import math
import os
from tqdm import tqdm
import time
import json
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# Fix the random seeds so results are reproducible
torch.manual_seed(42)
np.random.seed(42)

# Silence jieba's startup messages
jieba.setLogLevel(jieba.logging.INFO)

# Configure matplotlib to render Chinese plot labels
plt.rcParams["font.family"] = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC"]
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly


# 1. Data processing
class WeiboDataset(Dataset):
    def __init__(self, texts, labels, vocab=None, max_vocab_size=50000, min_freq=1):
        self.labels = labels
        # Tokenize with jieba
        self.tokenized_texts = [jieba.lcut(text) for text in texts]
        if vocab is None:
            # Build the vocabulary from scratch
            vocab = {'<pad>': 0, '<unk>': 1}
            word_freq = {}
            for tokens in self.tokenized_texts:
                for token in tokens:
                    word_freq[token] = word_freq.get(token, 0) + 1
            # Sort words by frequency, descending
            sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
            idx = 2
            for word, freq in sorted_words:
                if freq >= min_freq and idx < max_vocab_size:
                    vocab[word] = idx
                    idx += 1
        self.vocab = vocab
        # Convert each tokenized text to a list of indices
        self.texts_idx = []
        for tokens in self.tokenized_texts:
            self.texts_idx.append([vocab.get(token, vocab['<unk>']) for token in tokens])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return torch.tensor(self.texts_idx[idx]), torch.tensor(self.labels[idx])

    def get_vocab(self):
        return self.vocab


# 2. Attention mechanism
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        # Uniform init scaled by 1/sqrt(hidden_size)
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        # hidden: (1, batch, hidden); encoder_outputs: (seq_len, batch, hidden)
        timestep = encoder_outputs.size(0)
        hidden = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)
        # Attention scores over the time dimension
        attn_energies = self.score(hidden, encoder_outputs)
        # Normalized attention weights: (batch, 1, seq_len)
        return torch.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # Additive attention: v^T · tanh(W(h + s))
        energy = torch.tanh(self.attn(hidden + encoder_outputs))
        energy = energy.transpose(1, 2)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        energy = torch.bmm(v, energy)
        return energy.squeeze(1)
```
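For what it's worth, the Attention module can be sanity-checked in isolation before it's wired into anything. Below is a minimal shape check I put together myself (not from the handout); the batch size, sequence length, and hidden size are arbitrary:

```python
# Hypothetical smoke test for Attention (my addition, not the original code).
batch, seq_len, hidden_size = 4, 7, 128
attn = Attention(hidden_size)
hidden = torch.zeros(1, batch, hidden_size)                 # (1, batch, hidden)
encoder_outputs = torch.randn(seq_len, batch, hidden_size)  # (seq, batch, hidden)
weights = attn(hidden, encoder_outputs)
print(weights.shape)       # torch.Size([4, 1, 7]): one weight per timestep
print(weights.sum(dim=2))  # each row sums to 1 after the softmax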
```python
# 3. Model definition
class SentimentClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_dim,
                 n_layers, bidirectional, dropout, use_attention=True):
        super(SentimentClassifier, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer (inter-layer dropout only applies when stacking layers)
        self.rnn = nn.LSTM(embedding_dim, hidden_size, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0,
                           batch_first=True)
        # Attention layer, sized for a unidirectional RNN
        self.use_attention = use_attention
        if use_attention:
            self.attention = Attention(hidden_size)
        # Output layer; this sizing assumes bidirectional=False (which is
        # what main() uses), otherwise it would need hidden_size * 2
        self.fc = nn.Linear(hidden_size, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.n_layers = n_layers

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        # Pack the padded batch so the LSTM skips padding;
        # pack_padded_sequence requires the lengths tensor on the CPU
        packed_embedded = pack_padded_sequence(embedded, text_lengths.cpu(),
                                               batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        # Unpack back to a padded batch: (batch, seq_len, hidden)
        output, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        # Final hidden state of the last LSTM layer
        hidden = self.dropout(hidden[-1, :, :])
        if self.use_attention:
            # Weight the encoder outputs and pool them into a context vector
            attn_weights = self.attention(hidden.unsqueeze(0), output.transpose(0, 1))
            context = attn_weights.bmm(output)
            context = context.squeeze(1)
            hidden = self.dropout(context)
        output = self.fc(hidden)
        return output


# 4. Data loading and processing helpers
def load_data(file_path):
    df = pd.read_csv(file_path)
    texts = df['review'].tolist()
    labels = df['label'].tolist()
    return texts, labels


# Collate function (module level so DataLoader workers can pickle it)
def collate_batch(batch):
    texts, labels = zip(*batch)
    text_lengths = torch.tensor([len(text) for text in texts])
    # Pad each batch to the length of its longest sequence
    texts_padded = pad_sequence(texts, batch_first=True, padding_value=0)
    return texts_padded, text_lengths, torch.stack(labels)


def create_data_loaders(train_texts, train_labels, val_texts, val_labels,
                        test_texts, test_labels, vocab=None, batch_size=64):
    # Build datasets; validation and test reuse the training vocabulary
    train_dataset = WeiboDataset(train_texts, train_labels, vocab=vocab)
    val_dataset = WeiboDataset(val_texts, val_labels, vocab=train_dataset.get_vocab())
    test_dataset = WeiboDataset(test_texts, test_labels, vocab=train_dataset.get_vocab())
    vocab = train_dataset.get_vocab()
    # CPU-friendly loader settings: num_workers=0, pin_memory=False
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_batch, num_workers=0, pin_memory=False)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                            collate_fn=collate_batch, num_workers=0, pin_memory=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                             collate_fn=collate_batch, num_workers=0, pin_memory=False)
    return train_loader, val_loader, test_loader, vocab
```
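Likewise, a single forward pass on random token indices confirms the classifier produces one logit per class. This is my own sketch, assuming the classes above are defined; every dimension here is made up for illustration:

```python
# Hypothetical forward-pass check (my addition, not the original code).
model = SentimentClassifier(vocab_size=1000, embedding_dim=100, hidden_size=128,
                            output_dim=2, n_layers=2, bidirectional=False,
                            dropout=0.5, use_attention=True)
model.eval()  # disable dropout so the check is deterministic given the weights
texts = torch.randint(2, 1000, (8, 20))                # batch of 8, 20 tokens each
lengths = torch.tensor([20, 18, 15, 12, 10, 9, 7, 5])  # lengths stay on the CPU
logits = model(texts, lengths)
print(logits.shape)  # torch.Size([8, 2]): one logit per class
```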
```python
# 5. Training and evaluation functions
def train(model, iterator, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    for texts, text_lengths, labels in tqdm(iterator, desc="Training"):
        # text_lengths stays on the CPU: pack_padded_sequence requires CPU lengths
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts, text_lengths)  # logits: (batch, output_dim)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        # Accumulate predictions for the epoch accuracy
        preds = torch.argmax(predictions, dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return epoch_loss / len(iterator), accuracy


def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for texts, text_lengths, labels in tqdm(iterator, desc="Evaluating"):
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts, text_lengths)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            # Accumulate predictions for the epoch accuracy
            preds = torch.argmax(predictions, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    return epoch_loss / len(iterator), accuracy


def predict_sentiment(model, sentence, vocab, device):
    model.eval()
    # Tokenize and map to vocabulary indices
    tokens = jieba.lcut(sentence)
    indices = [vocab.get(token, vocab['<unk>']) for token in tokens]
    # Shape (1, seq_len): a batch containing one sentence
    tensor = torch.tensor(indices).unsqueeze(0).to(device)
    length = torch.tensor([len(indices)])  # lengths stay on the CPU
    prediction = model(tensor, length)
    pred_class = torch.argmax(prediction, dim=1)
    return pred_class.item()


# Plot the training history curves
def plot_training_history(history, save_path='training_history.png'):
    epochs = range(1, len(history) + 1)
    plt.figure(figsize=(12, 5))
    # Loss curves
    plt.subplot(1, 2, 1)
    plt.plot(epochs, [h['train_loss'] for h in history], 'b-', label='训练损失')
    plt.plot(epochs, [h['val_loss'] for h in history], 'r-', label='验证损失')
    plt.title('训练和验证损失')
    plt.xlabel('轮次')
    plt.ylabel('损失')
    plt.legend()
    plt.grid(True)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    # Accuracy curves
    plt.subplot(1, 2, 2)
    plt.plot(epochs, [h['train_acc'] for h in history], 'b-', label='训练准确率')
    plt.plot(epochs, [h['val_acc'] for h in history], 'r-', label='验证准确率')
    plt.title('训练和验证准确率')
    plt.xlabel('轮次')
    plt.ylabel('准确率')
    plt.legend()
    plt.grid(True)
    plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
    plt.tight_layout()
    plt.savefig(save_path)
    print(f"Training history plot saved to {save_path}")
    plt.close()
```
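And predict_sentiment can be exercised without the dataset by pairing an untrained model with a stub vocabulary. A hypothetical usage sketch of mine (the label it prints is meaningless until the model is actually trained):

```python
# Hypothetical usage of predict_sentiment (my addition, not the original code).
device = torch.device('cpu')
vocab = {'<pad>': 0, '<unk>': 1}  # stub vocabulary: every word maps to <unk>
model = SentimentClassifier(vocab_size=len(vocab), embedding_dim=100, hidden_size=128,
                            output_dim=2, n_layers=2, bidirectional=False,
                            dropout=0.5, use_attention=True).to(device)
label = predict_sentiment(model, "今天天气好", vocab, device)
print('positive' if label == 1 else 'negative')  # arbitrary for an untrained model
```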
```python
# 6. Main function
def main():
    # Prefer the GPU when one is available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {device}')
    if device.type == 'cuda':
        print(f'GPU name: {torch.cuda.get_device_name(0)}')
    else:
        print(f'CPU cores: {os.cpu_count()}')

    # Load the data
    print("Loading data...")
    file_path = 'weibo_senti_100k.csv'
    texts, labels = load_data(file_path)

    # Split into train/validation/test (81% / 9% / 10%)
    print("Splitting data...")
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.1, random_state=42)
    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=0.1, random_state=42)

    # Hyperparameters (unidirectional 2-layer LSTM, hidden_size=128)
    hyperparameters = {
        'batch_size': 128,       # smaller batches ease memory pressure on CPU
        'embedding_dim': 100,
        'hidden_size': 128,      # hidden state dimension
        'output_dim': 2,         # binary classification
        'n_layers': 2,           # number of stacked LSTM layers
        'bidirectional': False,  # unidirectional
        'dropout': 0.5,
        'learning_rate': 0.001,
        'n_epochs': 10,
        'use_attention': True
    }

    # Build the data loaders
    print("Creating data loaders...")
    train_loader, val_loader, test_loader, vocab = create_data_loaders(
        train_texts, train_labels, val_texts, val_labels, test_texts, test_labels,
        batch_size=hyperparameters['batch_size']
    )
    print(f"Vocabulary size: {len(vocab)}")

    # Initialize the model
    print("Initializing model...")
    model = SentimentClassifier(
        vocab_size=len(vocab),
        embedding_dim=hyperparameters['embedding_dim'],
        hidden_size=hyperparameters['hidden_size'],
        output_dim=hyperparameters['output_dim'],
        n_layers=hyperparameters['n_layers'],
        bidirectional=hyperparameters['bidirectional'],
        dropout=hyperparameters['dropout'],
        use_attention=hyperparameters['use_attention']
    ).to(device)

    print("Model architecture:")
    print(model)

    # Optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=hyperparameters['learning_rate'])
    criterion = nn.CrossEntropyLoss().to(device)

    # Training loop
    print("Training model...")
    best_val_loss = float('inf')
    training_history = []
    for epoch in range(hyperparameters['n_epochs']):
        start_time = time.time()
        train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
        val_loss, val_acc = evaluate(model, val_loader, criterion, device)
        epoch_time = time.time() - start_time

        # Keep the checkpoint with the lowest validation loss
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')
            print(f'Saved best model (val loss: {val_loss:.3f})')

        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_time:.2f}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {val_loss:.3f} | Val. Acc: {val_acc * 100:.2f}%')

        training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
            'time': epoch_time
        })

    # Save the training history
    with open('training_history.json', 'w') as f:
        json.dump(training_history, f, indent=4)

    # Plot the training curves
    plot_training_history(training_history)

    # Reload the best checkpoint and evaluate on the test set
    model.load_state_dict(torch.load('best_model.pt'))
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc * 100:.2f}%')

    # Save hyperparameters and results
    results = {
        'hyperparameters': hyperparameters,
        'test_loss': test_loss,
        'test_acc': test_acc,
        'training_history': training_history
    }
    with open('results.json', 'w') as f:
        json.dump(results, f, indent=4)

    # Example predictions
    examples = [
        "这个产品太棒了,我非常喜欢!",
        "这个服务太差劲了,简直是浪费时间!",
        "今天天气好,心情也跟着愉快起来。",
        "这个电影无聊透顶,后悔来看了。"
    ]
    print("\nExample predictions:")
    for example in examples:
        sentiment = predict_sentiment(model, example, vocab, device)
        print(f"Text: {example}")
        print(f"Sentiment: {'positive' if sentiment == 1 else 'negative'}")
        print()


if __name__ == '__main__':
    main()
```

What output does this code actually produce when it runs?
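For the record, the exact losses and accuracies can't be read off the code: they depend on weibo_senti_100k.csv, the hardware, and how the fixed seeds interact with CUDA. Structurally, a run prints the device info, the vocabulary size, the model architecture, per-epoch train/validation loss and accuracy, the final test loss and accuracy, and the four example predictions, and it writes best_model.pt, training_history.json, training_history.png, and results.json. A minimal sketch of mine for inspecting those artifacts after a finished run:

```python
# Hypothetical post-run inspection (my addition, not the teacher's code).
import json

with open('results.json') as f:
    results = json.load(f)

print(results['hyperparameters'])
print(f"test_loss={results['test_loss']:.3f}, test_acc={results['test_acc']:.3f}")
for epoch in results['training_history']:
    print(epoch['epoch'], epoch['train_loss'], epoch['val_loss'])
```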