A Guide to numpy.pad

This post introduces the pad function from Python's numpy library, which pads the borders of an array with a specified width of values. The examples cover padding a one-dimensional array, such as adding a zero on each side, and padding a two-dimensional array, showing how to pad along different axes and how to set different fill values.


numpy.pad(array, pad_width, mode='constant', **kwargs)
Parameters of numpy.pad:
array: the array to pad
pad_width: (before, after), i.e. how many values to pad at each end; (2, 3) pads 2 values at the front and 3 at the back
mode: defaults to 'constant', which pads with a constant value; other modes are available, as sketched below
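
For reference, a minimal sketch of a few of the other modes (standard numpy; the commented outputs follow each mode's documented rule):

import numpy as np

a = np.array([1, 2, 3])
print(np.pad(a, (2, 2), mode='edge'))     # repeat the edge value:      [1 1 1 2 3 3 3]
print(np.pad(a, (2, 2), mode='reflect'))  # mirror, excluding the edge: [3 2 1 2 3 2 1]
print(np.pad(a, (2, 2), mode='wrap'))     # wrap around to the far end: [2 3 1 2 3 1 2]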
Examples:
1. Padding a one-dimensional array

import numpy as np

a = [1,1,1,2,2,2,3,3,3,4,4,4]
a = np.pad(a, pad_width=(1, 1), mode='constant')
print(a)
[0 1 1 1 2 2 2 3 3 3 4 4 4 0]

A constant is added on each side of a; the default constant is 0.
2. Padding a one-dimensional array with a different value at each end

b = [1,1,2,2]
b = np.pad(b, (1, 1), mode='constant', constant_values=(2, 4))
print(b)
[2 1 1 2 2 4]

2 at the front, 4 at the back.
3. Padding a two-dimensional array
Note: the official numpy.pad documentation uses the term "axis" for each dimension. For a 2-D array, padding axis 0 adds rows and padding axis 1 adds columns; the sketch below pads one axis at a time to make the distinction clear.
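
A minimal sketch of padding one axis at a time (standard numpy, constant mode, default fill of 0):

import numpy as np

d = np.arange(10).reshape(2, 5)
print(np.pad(d, ((1, 1), (0, 0)), mode='constant'))  # axis 0 only: a row of zeros above and below
print(np.pad(d, ((0, 0), (2, 2)), mode='constant'))  # axis 1 only: two columns of zeros on each side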

d = np.arange(0, 10).reshape(2, 5)
print(d)
d = np.pad(d, ((1, 1), (2, 2)), mode='constant')
# d = np.pad(d, (1, 1), mode='constant')  # shorthand: pads 1 on every side of every axis
print(d)
[[0 1 2 3 4]
 [5 6 7 8 9]]
[[0 0 0 0 0 0 0 0 0]
 [0 0 0 1 2 3 4 0 0]
 [0 0 5 6 7 8 9 0 0]
 [0 0 0 0 0 0 0 0 0]]
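
The fill value can also differ per axis and per side: constant_values takes one (before, after) pair per axis, mirroring pad_width. A minimal sketch (standard numpy; note that the axis-1 constants fill the corner regions, since constant mode pads the axes in order):

import numpy as np

d = np.arange(10).reshape(2, 5)
e = np.pad(d, ((1, 1), (2, 2)), mode='constant',
           constant_values=((1, 2), (3, 4)))  # row of 1s above, row of 2s below, columns of 3s/4s left/right
print(e)
# [[3 3 1 1 1 1 1 4 4]
#  [3 3 0 1 2 3 4 4 4]
#  [3 3 5 6 7 8 9 4 4]
#  [3 3 2 2 2 2 2 4 4]]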