Fixing the `assert (boxes[:, 2] >= boxes[:, 0]).all()` error when training Faster R-CNN

This post describes a way to handle invalid bounding-box coordinates that appear when images are horizontally flipped for data augmentation in object detection. By modifying the append_flipped_images() function in lib/datasets/imdb.py to check and repair the flipped box coordinates, every bounding box ends up representing a valid object location and the training-time assertion no longer fires.

Modify the append_flipped_images() function in lib/datasets/imdb.py. In the data-preparation step, insert the following check right below the line `boxes[:, 2] = widths[i] - oldx1 - 1`:
```python
for b in range(len(boxes)):
    if boxes[b][2] < boxes[b][0]:
        boxes[b][0] = 0
```
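For orientation, here is roughly where the patch lands inside the flip routine — a minimal sketch of append_flipped_images() as it appears in stock py-faster-rcnn (your copy may differ slightly, and the roidb bookkeeping after the assert is elided):

```python
def append_flipped_images(self):
    num_images = self.num_images
    widths = self._get_widths()
    for i in range(num_images):
        boxes = self.roidb[i]['boxes'].copy()
        oldx1 = boxes[:, 0].copy()
        oldx2 = boxes[:, 2].copy()
        # Mirror the x-coordinates around the image width
        boxes[:, 0] = widths[i] - oldx2 - 1
        boxes[:, 2] = widths[i] - oldx1 - 1
        # Patch: if flipping left x2 < x1 (commonly caused by an annotation
        # with x1 == 0 underflowing the unsigned box array), clamp x1 to 0
        for b in range(len(boxes)):
            if boxes[b][2] < boxes[b][0]:
                boxes[b][0] = 0
        assert (boxes[:, 2] >= boxes[:, 0]).all()
        # ... build the flipped roidb entry and append it (unchanged) ...
```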

A related anchor assertion comes up with torchvision's Faster R-CNN as well. The script below (a small-object detector trained on a COCO-format dataset) fails at the very start of training:

```python
import numpy as np
import torch
import torchvision
from torch.optim import lr_scheduler
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import Compose, ToTensor, Normalize
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import os
import matplotlib.pyplot as plt
import cv2

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


# ===================== 1. Corrected CocoDataset =====================
class CocoDataset(Dataset):
    def __init__(self, img_dir, ann_file, transform=None):
        self.img_dir = img_dir
        self.coco = COCO(ann_file)
        self.img_ids = self.coco.getImgIds()
        self.transform = transform
        self.resize_size = (640, 640)

    def resize_image(self, img, target):
        orig_h, orig_w = img.shape[:2]
        new_w, new_h = self.resize_size
        # Letterbox: scale proportionally and pad, so small objects are not distorted
        scale = min(new_w / orig_w, new_h / orig_h)
        resize_w = int(orig_w * scale)
        resize_h = int(orig_h * scale)
        img = cv2.resize(img, (resize_w, resize_h))
        pad_w = (new_w - resize_w) // 2
        pad_h = (new_h - resize_h) // 2
        img = cv2.copyMakeBorder(img, pad_h, pad_h, pad_w, pad_w,
                                 cv2.BORDER_CONSTANT, value=0)
        # Adjust the ground-truth boxes to the resized, padded image
        if len(target['boxes']) > 0:
            boxes = target['boxes'].numpy()
            boxes[:, [0, 2]] *= scale
            boxes[:, [1, 3]] *= scale
            boxes[:, [0, 2]] += pad_w
            boxes[:, [1, 3]] += pad_h
            valid = (boxes[:, 2] - boxes[:, 0] > 1) & (boxes[:, 3] - boxes[:, 1] > 1)
            valid &= (boxes[:, 0] >= 0) & (boxes[:, 1] >= 0) & \
                     (boxes[:, 2] <= new_w) & (boxes[:, 3] <= new_h)
            boxes = boxes[valid]
            target['boxes'] = torch.as_tensor(boxes, dtype=torch.float32)
            target['labels'] = target['labels'][valid]
        target['height'] = torch.tensor(new_h)
        target['width'] = torch.tensor(new_w)
        return img, target

    def __getitem__(self, idx):
        img_id = self.img_ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]
        img_path = f"{self.img_dir}/{img_info['file_name']}"
        img = cv2.imread(img_path)
        if img is None:
            raise FileNotFoundError(f"Image not found: {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)
        boxes, labels = [], []
        for ann in anns:
            if ann['category_id'] != 1:
                continue
            x1, y1, w, h = ann['bbox']
            if w <= 0 or h <= 0:
                continue
            x2 = x1 + w
            y2 = y1 + h
            boxes.append([x1, y1, x2, y2])
            labels.append(1)
        boxes = torch.as_tensor(boxes, dtype=torch.float32) if boxes else torch.empty((0, 4))
        labels = torch.as_tensor(labels, dtype=torch.int64) if labels else torch.empty((0,), dtype=torch.int64)
        target = {'boxes': boxes, 'labels': labels, 'image_id': torch.tensor([img_id])}
        img, target = self.resize_image(img, target)
        if self.transform:
            img = self.transform(img)
        return img, target

    def __len__(self):
        return len(self.img_ids)


# ===================== 2. Visualization (fixed de-normalization) =====================
def visualize_predictions(image, targets, outputs, idx=0):
    img = image[idx].permute(1, 2, 0).cpu().numpy()
    img = (img * np.array([0.229, 0.224, 0.225])) + np.array([0.485, 0.456, 0.406])
    img = np.clip(img, 0, 1)
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    ax.imshow(img)
    # Draw ground-truth boxes
    gt_boxes = targets[idx]['boxes'].cpu()
    for box in gt_boxes:
        x1, y1, x2, y2 = box
        rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, color='green', linewidth=2)
        ax.add_patch(rect)
    # Draw predicted boxes
    if len(outputs) > idx:
        pred = outputs[idx]
        keep = pred['scores'] > 0.1  # lower threshold to show more small objects
        pred_boxes = pred['boxes'][keep].cpu()
        pred_labels = pred['labels'][keep].cpu()
        pred_scores = pred['scores'][keep].cpu()
        for box, label, score in zip(pred_boxes, pred_labels, pred_scores):
            x1, y1, x2, y2 = box
            rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, color='red', linewidth=2)
            ax.text(x1, y1, f'{label}:{score:.2f}', color='red', fontsize=12)
            ax.add_patch(rect)
    plt.title("Green=GT, Red=Pred")
    plt.axis('off')
    plt.show()


# ===================== 3. Training logic (no full-model checkpoint loading) =====================
os.makedirs('checkpoints', exist_ok=True)

# Data loading
transform = Compose([ToTensor(),
                     Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
train_dataset = CocoDataset(
    img_dir=r'D:\Yolov8\coco_dataset1\images\train',
    ann_file=r'D:\Yolov8\coco_dataset1\annotations\train.json',
    transform=transform
)
val_dataset = CocoDataset(
    img_dir=r'D:\Yolov8\coco_dataset1\images\val',
    ann_file=r'D:\Yolov8\coco_dataset1\annotations\val.json',
    transform=transform
)

def collate_fn(batch):
    return tuple(zip(*batch))

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)

backbone = resnet_fpn_backbone('resnet50', pretrained=True)

# Model setup (custom small-object anchors; only the backbone is pretrained)
num_classes = 2
anchor_sizes = ((16, 32, 64),)
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)

# Key point: only the backbone loads pretrained weights; the RPN is randomly
# initialized, so there is no parameter conflict
model = FasterRCNN(
    backbone=backbone,
    num_classes=num_classes,
    rpn_anchor_generator=anchor_generator,
    min_size=640,
    max_size=640,
    box_nms_thresh=0.5,
    box_score_thresh=0.05,
)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)
print(f"Using device: {device}")

# Optimizer (single learning rate, tuned for small objects)
optimizer = optim.SGD(
    model.parameters(),
    lr=1e-3,
    momentum=0.9,
    weight_decay=5e-4,
    nesterov=True
)
scheduler = lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)  # periodic LR decay

# Training loop (more epochs help small-object features converge; 5 here for brevity)
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_cls_loss = 0.0
    total_reg_loss = 0.0
    for batch_idx, (images, targets) in enumerate(train_loader):
        images = [img.to(device) for img in images]
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())
        optimizer.zero_grad()
        losses.backward()
        optimizer.step()
        total_loss += losses.item()
        total_cls_loss += loss_dict['loss_classifier'].item()
        total_reg_loss += loss_dict['loss_box_reg'].item()
        if (batch_idx + 1) % 10 == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Batch [{batch_idx + 1}/{len(train_loader)}], "
                f"Loss: {losses.item():.4f}, "
                f"Clf Loss: {loss_dict['loss_classifier'].item():.4f}, "
                f"Reg Loss: {loss_dict['loss_box_reg'].item():.4f}"
            )
    avg_loss = total_loss / len(train_loader)
    avg_cls = total_cls_loss / len(train_loader)
    avg_reg = total_reg_loss / len(train_loader)
    print(f"Epoch {epoch + 1} - Avg Loss: {avg_loss:.4f}, Cls: {avg_cls:.4f}, Reg: {avg_reg:.4f}")
    scheduler.step()
    # Save checkpoint (only the weights trained in this run)
    torch.save(model.state_dict(), f'checkpoints/faster_rcnn_epoch_{epoch + 1}.pth')

# ===================== 4. Evaluation + visualization =====================
print("\nStarting model evaluation...")
model.eval()
results = []

# Visualize the first validation sample
print("Visualizing validation predictions...")
images, targets = next(iter(val_loader))
images = [img.to(device) for img in images]
with torch.no_grad():
    outputs = model(images)
for output in outputs:
    print("Predicted labels:", output['labels'].cpu().numpy())
    print("Predicted scores:", output['scores'].cpu().numpy())
    print("Predicted boxes:", output['boxes'].cpu().numpy())
visualize_predictions(images, targets, outputs, idx=0)

# Generate evaluation results
with torch.no_grad():
    for images, targets in val_loader:
        images = [img.to(device) for img in images]
        outputs = model(images)
        for output, target in zip(outputs, targets):
            img_id = target['image_id'].item()
            boxes = output['boxes'].cpu().numpy()
            scores = output['scores'].cpu().numpy()
            labels = output['labels'].cpu().numpy()
            for box, score, label in zip(boxes, scores, labels):
                if score < 0.01:  # very low threshold: keep nearly every candidate box
                    continue
                x1, y1, x2, y2 = box
                w = x2 - x1
                h = y2 - y1
                if w <= 0 or h <= 0:
                    continue
                results.append({
                    'image_id': img_id,
                    'category_id': int(label),
                    'bbox': [float(x1), float(y1), float(w), float(h)],
                    'score': float(score)
                })

if len(results) == 0:
    print("⚠️ Warning: no valid predictions!")
else:
    cocoGt = val_dataset.coco
    if 'info' not in cocoGt.dataset:
        cocoGt.dataset['info'] = {'version': '1.0'}
    try:
        cocoDt = cocoGt.loadRes(results)
        cocoEval = COCOeval(cocoGt, cocoDt, iouType='bbox')
        cocoEval.evaluate()
        cocoEval.accumulate()
        cocoEval.summarize()
        print("✅ Evaluation finished")
        print(f"mAP@0.5:0.95 = {cocoEval.stats[0]:.4f}")
        print(f"mAP@0.5 = {cocoEval.stats[1]:.4f}")
        # Note: COCOeval stats[6] is AR@1 (average recall, 1 detection per image),
        # not recall at IoU 0.5 as the original label suggested
        print(f"Recall@0.5 = {cocoEval.stats[6]:.4f}")
    except Exception as e:
        print("❌ Evaluation failed:", str(e))
```

Running the script produces the following output and crashes in the RPN's anchor generator:

```
C:\Users\YangGuang\.conda\envs\pytorch\python.exe D:\Yolov8\.github\Faster-Rcnn\faster-rcnn.py
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\_utils.py:135: UserWarning: Using 'backbone_name' as positional parameter(s) is deprecated since 0.13 and may be removed in the future. Please use keyword parameter(s) instead.
  warnings.warn(
C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\_utils.py:208: UserWarning: The parameter 'pretrained' is deprecated since 0.13 and may be removed in the future, please use 'weights' instead.
  warnings.warn(
C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\_utils.py:223: UserWarning: Arguments other than a weight enum or `None` for 'weights' are deprecated since 0.13 and may be removed in the future. The current behavior is equivalent to passing `weights=ResNet50_Weights.IMAGENET1K_V1`. You can also use `weights=ResNet50_Weights.DEFAULT` to get the most up-to-date weights.
  warnings.warn(msg)
C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\cuda\__init__.py:235: UserWarning: NVIDIA GeForce RTX 5070 Laptop GPU with CUDA capability sm_120 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90 compute_37.
If you want to use the NVIDIA GeForce RTX 5070 Laptop GPU GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/
  warnings.warn(
Using device: cuda
Traceback (most recent call last):
  File "D:\Yolov8\.github\Faster-Rcnn\faster-rcnn.py", line 204, in <module>
    loss_dict = model(images, targets)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\detection\generalized_rcnn.py", line 104, in forward
    proposals, proposal_losses = self.rpn(images, features, targets)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\detection\rpn.py", line 362, in forward
    anchors = self.anchor_generator(images, features)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\detection\anchor_utils.py", line 127, in forward
    anchors_over_all_feature_maps = self.grid_anchors(grid_sizes, strides)
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torchvision\models\detection\anchor_utils.py", line 88, in grid_anchors
    torch._assert(
  File "C:\Users\YangGuang\.conda\envs\pytorch\Lib\site-packages\torch\__init__.py", line 2041, in _assert
    assert condition, message
AssertionError: Anchors should be Tuple[Tuple[int]] because each feature map could potentially have different sizes and aspect ratios. There needs to be a match between the number of feature maps passed and the number of sizes / aspect ratios specified.

Process finished with exit code 1
```
The error you are hitting is:

```
AssertionError: Anchors should be Tuple[Tuple[int]] because each feature map could potentially have different sizes and aspect ratios. There needs to be a match between the number of feature maps passed and the number of sizes / aspect ratios specified.
```

### 🔍 Root cause

The assertion comes from torchvision's `AnchorGenerator`. Your anchor configuration is:

```python
anchor_sizes = ((16, 32, 64),)
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
```

This is syntactically valid, but it declares anchors for **one** feature map, while:

- `resnet_fpn_backbone('resnet50', pretrained=True)` returns a dict of **five** feature maps (keys `'0'`, `'1'`, `'2'`, `'3'`, `'pool'`), and
- `AnchorGenerator` generates anchors for every feature level it receives: `sizes` and `aspect_ratios` must be nested tuples whose outer length equals the number of feature maps. A single size group is **not** broadcast across levels.

So `((16, 32, 64),)` (outer length 1) against five feature maps trips the assertion.

---

## ✅ Fix

Give `anchor_sizes` and `aspect_ratios` one entry per feature level:

```python
# Five feature levels is the usual case for a ResNet-FPN backbone
anchor_sizes = (
    (16,),   # '0' (highest resolution)
    (32,),   # '1'
    (64,),   # '2'
    (128,),  # '3'
    (256,),  # 'pool'
)
# Or equivalently:
anchor_sizes = tuple((s,) for s in [16, 32, 64, 128, 256])

aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)  # three ratios at every level
anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)
```

This passes the type and dimension check.

---

### 🛠️ Full corrected model-construction block (replaces the original)

```python
# ResNet50 + FPN backbone
backbone = resnet_fpn_backbone('resnet50', pretrained=True)

# Custom AnchorGenerator for the 5 FPN levels; the high-resolution
# levels get the smaller anchors, which suits small-object detection
anchor_sizes = (
    (16,),   # high resolution, small objects
    (32,),
    (64,),
    (128,),
    (256,),
)
aspect_ratios = ((0.5, 1.0, 2.0),) * 5  # three ratios per level

anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)

model = FasterRCNN(
    backbone=backbone,
    num_classes=num_classes,
    rpn_anchor_generator=anchor_generator,
    min_size=640,
    max_size=640,
    box_nms_thresh=0.5,
    box_score_thresh=0.05,
)
```

---

### 💡 Notes

1. **Why does `((16, 32, 64),)` fail?** It is a tuple containing a single inner tuple, i.e. one anchor-size group for one feature map — but the FPN backbone hands the RPN five feature maps, so the counts don't match.

2. **How do you know how many feature levels there are?** Print them:

```python
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
import torch

backbone = resnet_fpn_backbone('resnet50', pretrained=False)
x = torch.randn(1, 3, 640, 640)
features = backbone(x)
print("Feature keys:", list(features.keys()))
print("Number of feature levels:", len(features))
```

The output is:

```
Feature keys: ['0', '1', '2', '3', 'pool']
Number of feature levels: 5
```

So you need **five** groups of anchor sizes.

3. **Small-object tuning**: put even smaller anchors on the high-resolution levels, e.g. `(8,)` and `(16,)`:

```python
anchor_sizes = (
    (8,),    # very small
    (16,),   # small
    (32,),   # medium
    (64,),   # large
    (128,),  # very large
)
```

---

### ✅ Summary of the fix

| Problem | Solution |
|------|---------|
| `AnchorGenerator` assertion fails | Explicitly provide a `sizes` tuple with one group per feature level |
| Only one level declared | Use `tuple((s,) for s in [...])` with 5 entries |
| Poor small-object performance | Add small anchors (e.g. 8×8) on the high-resolution levels |

---

### ✅ Recommended configuration (for small objects)

```python
anchor_sizes = (
    (8,),
    (16,),
    (32,),
    (64,),
    (128,),
)
aspect_ratios = ((0.5, 1.0, 2.0),) * 5  # keep three shapes at every level

anchor_generator = AnchorGenerator(sizes=anchor_sizes, aspect_ratios=aspect_ratios)
```

---

### ✅ Bonus: the warnings (non-fatal)

While you're at it, you can also clean up these warnings:

```
UserWarning: Using 'backbone_name' as positional parameter(s) is deprecated since 0.13 ...
```

Use keyword arguments:

```python
backbone = resnet_fpn_backbone(backbone_name='resnet50', pretrained=True)
```

or, better on recent torchvision, the `weights` API:

```python
import torchvision.models as models

backbone = resnet_fpn_backbone(
    backbone_name='resnet50',
    weights=models.ResNet50_Weights.IMAGENET1K_V1,
)
```

Also note the last warning in your log: this PyTorch build ships no kernels for the RTX 5070's `sm_120` compute capability, so CUDA ops can fail at runtime; follow the pytorch.org link in the warning to install a build that supports your GPU.

With the anchor configuration fixed, training runs without the `AssertionError`.

---
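If you want to guard against this class of mismatch up front, you can assert it yourself before building the model. A minimal sketch, assuming torchvision ≥ 0.13 (the variable names mirror the recommended configuration above):

```python
import torch
from torchvision.models.detection.backbone_utils import resnet_fpn_backbone

# Anchor configuration from the recommended small-object setup above
anchor_sizes = tuple((s,) for s in [8, 16, 32, 64, 128])
aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)

# Run a dummy tensor through the backbone to count its output levels
backbone = resnet_fpn_backbone(backbone_name='resnet50', weights=None)
features = backbone(torch.randn(1, 3, 640, 640))

# These checks mirror the conditions AnchorGenerator asserts on internally
assert len(anchor_sizes) == len(features), (
    f"{len(anchor_sizes)} anchor size groups vs. {len(features)} feature maps"
)
assert len(aspect_ratios) == len(anchor_sizes), "one ratio group per size group"
print("Anchor configuration matches the FPN output levels.")
```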