SAM3 Deployment
This post mainly covers deploying SAM3 for image segmentation; the video pipeline can be adapted from it for reference.
GitHub:https://github.com/facebookresearch/sam3/tree/main?tab=readme-ov-file
Paper: https://ai.meta.com/research/publications/sam-3-segment-anything-with-concepts/
First, install the dependencies.
Prerequisites:
- Python 3.12 or later
- PyTorch 2.7 or later
- A CUDA-compatible GPU with CUDA 12.6 or later
conda create -n sam3 python=3.12
conda deactivate
conda activate sam3
- Install PyTorch with CUDA support:
pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
- Clone the repository and install the package:
git clone https://github.com/facebookresearch/sam3.git
cd sam3
pip install -e .
- Install optional extras, e.g. for the notebooks or for development:
# For running example notebooks
pip install -e ".[notebooks]"
# For development
pip install -e ".[train,dev]"
Model download
Since the official site requires an access request, you can download the weights from the ModelScope community instead: https://www.modelscope.cn/models/facebook/sam3/summary
Find the sam3.pt file there.
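If you prefer to script the download, here is a minimal sketch using the modelscope Python package (install with pip install modelscope; please verify the exact API against the ModelScope documentation):
from modelscope import snapshot_download

# Download the facebook/sam3 repository (including sam3.pt) into a local directory
local_dir = snapshot_download("facebook/sam3")
print(local_dir)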
Deployment
Next, deploy the model with the following image-inference script.
train.py
import torch
#################################### For Image ####################################
from PIL import Image
from sam3.model_builder import build_sam3_image_model
from sam3.model.sam3_image_processor import Sam3Processor
from sam3.visualization_utils import draw_box_on_image, normalize_bbox, plot_results
# Load the model
model = build_sam3_image_model()
processor = Sam3Processor(model)
# Load an image
image = Image.open("your_images.jpg")  # change to your image path
inference_state = processor.set_image(image)
# Prompt the model with text
output = processor.set_text_prompt(state=inference_state, prompt="name")  # replace "name" with your text prompt (a noun phrase such as "mirror")
# Get the masks, bounding boxes, and scores
masks, boxes, scores = output["masks"], output["boxes"], output["scores"]
plot_results(image, inference_state, output_file="your/sam3/your_images.jpg")  # change to where you want the visualization saved
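Besides the rendered overlay, the raw outputs can be saved directly. The following is a minimal sketch; it assumes each entry of masks is a torch tensor of shape (1, H, W) and each score is a scalar tensor, as suggested by how the modified plot_results() indexes them further below — adjust it to the shapes you actually observe:
import numpy as np
from PIL import Image

for i, (mask, score) in enumerate(zip(masks, scores)):
    m = mask.squeeze(0).cpu().numpy()           # (H, W) mask for object i
    binary = (m > 0.5).astype(np.uint8) * 255   # threshold to a 0/255 image
    Image.fromarray(binary).save(f"mask_{i}_score_{score.item():.2f}.png")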
Because downloading the checkpoint directly on a cloud server is often inconvenient, the usual workflow is to download the model yourself and upload it to the server. This requires a small change in model_builder.py so that the code uses the local model directly; for convenience, the full modified file is pasted below. (The file is located at sam3/sam3/.)
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import os
from typing import Optional
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from iopath.common.file_io import g_pathmgr
from sam3.model.decoder import (
TransformerDecoder,
TransformerDecoderLayer,
TransformerDecoderLayerv2,
TransformerEncoderCrossAttention,
)
from sam3.model.encoder import TransformerEncoderFusion, TransformerEncoderLayer
from sam3.model.geometry_encoders import SequenceGeometryEncoder
from sam3.model.maskformer_segmentation import PixelDecoder, UniversalSegmentationHead
from sam3.model.memory import (
CXBlock,
SimpleFuser,
SimpleMaskDownSampler,
SimpleMaskEncoder,
)
from sam3.model.model_misc import (
DotProductScoring,
MLP,
MultiheadAttentionWrapper as MultiheadAttention,
TransformerWrapper,
)
from sam3.model.necks import Sam3DualViTDetNeck
from sam3.model.position_encoding import PositionEmbeddingSine
from sam3.model.sam1_task_predictor import SAM3InteractiveImagePredictor
from sam3.model.sam3_image import Sam3Image, Sam3ImageOnVideoMultiGPU
from sam3.model.sam3_tracking_predictor import Sam3TrackerPredictor
from sam3.model.sam3_video_inference import Sam3VideoInferenceWithInstanceInteractivity
from sam3.model.sam3_video_predictor import Sam3VideoPredictorMultiGPU
from sam3.model.text_encoder_ve import VETextEncoder
from sam3.model.tokenizer_ve import SimpleTokenizer
from sam3.model.vitdet import ViT
from sam3.model.vl_combiner import SAM3VLBackbone
from sam3.sam.transformer import RoPEAttention
# Setup TensorFloat-32 for Ampere GPUs if available
def _setup_tf32() -> None:
"""Enable TensorFloat-32 for Ampere GPUs if available."""
if torch.cuda.is_available():
device_props = torch.cuda.get_device_properties(0)
if device_props.major >= 8:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
_setup_tf32()
def _create_position_encoding(precompute_resolution=None):
"""Create position encoding for visual backbone."""
return PositionEmbeddingSine(
num_pos_feats=256,
normalize=True,
scale=None,
temperature=10000,
precompute_resolution=precompute_resolution,
)
def _create_vit_backbone(compile_mode=None):
"""Create ViT backbone for visual feature extraction."""
return ViT(
img_size=1008,
pretrain_img_size=336,
patch_size=14,
embed_dim=1024,
depth=32,
num_heads=16,
mlp_ratio=4.625,
norm_layer="LayerNorm",
drop_path_rate=0.1,
qkv_bias=True,
use_abs_pos=True,
tile_abs_pos=True,
global_att_blocks=(7, 15, 23, 31),
rel_pos_blocks=(),
use_rope=True,
use_interp_rope=True,
window_size=24,
pretrain_use_cls_token=True,
retain_cls_token=False,
ln_pre=True,
ln_post=False,
return_interm_layers=False,
bias_patch_embed=False,
compile_mode=compile_mode,
)
def _create_vit_neck(position_encoding, vit_backbone, enable_inst_interactivity=False):
"""Create ViT neck for feature pyramid."""
return Sam3DualViTDetNeck(
position_encoding=position_encoding,
d_model=256,
scale_factors=[4.0, 2.0, 1.0, 0.5],
trunk=vit_backbone,
add_sam2_neck=enable_inst_interactivity,
)
def _create_vl_backbone(vit_neck, text_encoder):
"""Create visual-language backbone."""
return SAM3VLBackbone(visual=vit_neck, text=text_encoder, scalp=1)
def _create_transformer_encoder() -> TransformerEncoderFusion:
"""Create transformer encoder with its layer."""
encoder_layer = TransformerEncoderLayer(
activation="relu",
d_model=256,
dim_feedforward=2048,
dropout=0.1,
pos_enc_at_attn=True,
pos_enc_at_cross_attn_keys=False,
pos_enc_at_cross_attn_queries=False,
pre_norm=True,
self_attention=MultiheadAttention(
num_heads=8,
dropout=0.1,
embed_dim=256,
batch_first=True,
),
cross_attention=MultiheadAttention(
num_heads=8,
dropout=0.1,
embed_dim=256,
batch_first=True,
),
)
encoder = TransformerEncoderFusion(
layer=encoder_layer,
num_layers=6,
d_model=256,
num_feature_levels=1,
frozen=False,
use_act_checkpoint=True,
add_pooled_text_to_img_feat=False,
pool_text_with_mask=True,
)
return encoder
def _create_transformer_decoder() -> TransformerDecoder:
"""Create transformer decoder with its layer."""
decoder_layer = TransformerDecoderLayer(
activation="relu",
d_model=256,
dim_feedforward=2048,
dropout=0.1,
cross_attention=MultiheadAttention(
num_heads=8,
dropout=0.1,
embed_dim=256,
),
n_heads=8,
use_text_cross_attention=True,
)
decoder = TransformerDecoder(
layer=decoder_layer,
num_layers=6,
num_queries=200,
return_intermediate=True,
box_refine=True,
num_o2m_queries=0,
dac=True,
boxRPB="log",
d_model=256,
frozen=False,
interaction_layer=None,
dac_use_selfatt_ln=True,
resolution=1008,
stride=14,
use_act_checkpoint=True,
presence_token=True,
)
return decoder
def _create_dot_product_scoring():
"""Create dot product scoring module."""
prompt_mlp = MLP(
input_dim=256,
hidden_dim=2048,
output_dim=256,
num_layers=2,
dropout=0.1,
residual=True,
out_norm=nn.LayerNorm(256),
)
return DotProductScoring(d_model=256, d_proj=256, prompt_mlp=prompt_mlp)
def _create_segmentation_head(compile_mode=None):
"""Create segmentation head with pixel decoder."""
pixel_decoder = PixelDecoder(
num_upsampling_stages=3,
interpolation_mode="nearest",
hidden_dim=256,
compile_mode=compile_mode,
)
cross_attend_prompt = MultiheadAttention(
num_heads=8,
dropout=0,
embed_dim=256,
)
segmentation_head = UniversalSegmentationHead(
hidden_dim=256,
upsampling_stages=3,
aux_masks=False,
presence_head=False,
dot_product_scorer=None,
act_ckpt=True,
cross_attend_prompt=cross_attend_prompt,
pixel_decoder=pixel_decoder,
)
return segmentation_head
def _create_geometry_encoder():
"""Create geometry encoder with all its components."""
# Create position encoding for geometry encoder
geo_pos_enc = _create_position_encoding()
# Create CX block for fuser
cx_block = CXBlock(
dim=256,
kernel_size=7,
padding=3,
layer_scale_init_value=1.0e-06,
use_dwconv=True,
)
# Create geometry encoder layer
geo_layer = TransformerEncoderLayer(
activation="relu",
d_model=256,
dim_feedforward=2048,
dropout=0.1,
pos_enc_at_attn=False,
pre_norm=True,
self_attention=MultiheadAttention(
num_heads=8,
dropout=0.1,
embed_dim=256,
batch_first=False,
),
pos_enc_at_cross_attn_queries=False,
pos_enc_at_cross_attn_keys=True,
cross_attention=MultiheadAttention(
num_heads=8,
dropout=0.1,
embed_dim=256,
batch_first=False,
),
)
# Create geometry encoder
input_geometry_encoder = SequenceGeometryEncoder(
pos_enc=geo_pos_enc,
encode_boxes_as_points=False,
points_direct_project=True,
points_pool=True,
points_pos_enc=True,
boxes_direct_project=True,
boxes_pool=True,
boxes_pos_enc=True,
d_model=256,
num_layers=3,
layer=geo_layer,
use_act_ckpt=True,
add_cls=True,
add_post_encode_proj=True,
)
return input_geometry_encoder
def _create_sam3_model(
backbone,
transformer,
input_geometry_encoder,
segmentation_head,
dot_prod_scoring,
inst_interactive_predictor,
eval_mode,
):
"""Create the SAM3 image model."""
common_params = {
"backbone": backbone,
"transformer": transformer,
"input_geometry_encoder": input_geometry_encoder,
"segmentation_head": segmentation_head,
"num_feature_levels": 1,
"o2m_mask_predict": True,
"dot_prod_scoring": dot_prod_scoring,
"use_instance_query": False,
"multimask_output": True,
"inst_interactive_predictor": inst_interactive_predictor,
}
matcher = None
if not eval_mode:
from sam3.train.matcher import BinaryHungarianMatcherV2
matcher = BinaryHungarianMatcherV2(
focal=True,
cost_class=2.0,
cost_bbox=5.0,
cost_giou=2.0,
alpha=0.25,
gamma=2,
stable=False,
)
common_params["matcher"] = matcher
model = Sam3Image(**common_params)
return model
def _create_tracker_maskmem_backbone():
"""Create the SAM3 Tracker memory encoder."""
# Position encoding for mask memory backbone
position_encoding = PositionEmbeddingSine(
num_pos_feats=64,
normalize=True,
scale=None,
temperature=10000,
precompute_resolution=1008,
)
# Mask processing components
mask_downsampler = SimpleMaskDownSampler(
kernel_size=3, stride=2, padding=1, interpol_size=[1152, 1152]
)
cx_block_layer = CXBlock(
dim=256,
kernel_size=7,
padding=3,
layer_scale_init_value=1.0e-06,
use_dwconv=True,
)
fuser = SimpleFuser(layer=cx_block_layer, num_layers=2)
maskmem_backbone = SimpleMaskEncoder(
out_dim=64,
position_encoding=position_encoding,
mask_downsampler=mask_downsampler,
fuser=fuser,
)
return maskmem_backbone
def _create_tracker_transformer():
"""Create the SAM3 Tracker transformer components."""
# Self attention
self_attention = RoPEAttention(
embedding_dim=256,
num_heads=1,
downsample_rate=1,
dropout=0.1,
rope_theta=10000.0,
feat_sizes=[72, 72],
use_fa3=False,
use_rope_real=False,
)
# Cross attention
cross_attention = RoPEAttention(
embedding_dim=256,
num_heads=1,
downsample_rate=1,
dropout=0.1,
kv_in_dim=64,
rope_theta=10000.0,
feat_sizes=[72, 72],
rope_k_repeat=True,
use_fa3=False,
use_rope_real=False,
)
# Encoder layer
encoder_layer = TransformerDecoderLayerv2(
cross_attention_first=False,
activation="relu",
dim_feedforward=2048,
dropout=0.1,
pos_enc_at_attn=False,
pre_norm=True,
self_attention=self_attention,
d_model=256,
pos_enc_at_cross_attn_keys=True,
pos_enc_at_cross_attn_queries=False,
cross_attention=cross_attention,
)
# Encoder
encoder = TransformerEncoderCrossAttention(
remove_cross_attention_layers=[],
batch_first=True,
d_model=256,
frozen=False,
pos_enc_at_input=True,
layer=encoder_layer,
num_layers=4,
use_act_checkpoint=False,
)
# Transformer wrapper
transformer = TransformerWrapper(
encoder=encoder,
decoder=None,
d_model=256,
)
return transformer
def build_tracker(
apply_temporal_disambiguation: bool, with_backbone: bool = False, compile_mode=None
) -> Sam3TrackerPredictor:
"""
Build the SAM3 Tracker module for video tracking.
Returns:
Sam3TrackerPredictor: Wrapped SAM3 Tracker module
"""
# Create model components
maskmem_backbone = _create_tracker_maskmem_backbone()
transformer = _create_tracker_transformer()
backbone = None
if with_backbone:
vision_backbone = _create_vision_backbone(compile_mode=compile_mode)
backbone = SAM3VLBackbone(scalp=1, visual=vision_backbone, text=None)
# Create the Tracker module
model = Sam3TrackerPredictor(
image_size=1008,
num_maskmem=7,
backbone=backbone,
backbone_stride=14,
transformer=transformer,
maskmem_backbone=maskmem_backbone,
# SAM parameters
multimask_output_in_sam=True,
# Evaluation
forward_backbone_per_frame_for_eval=True,
trim_past_non_cond_mem_for_eval=False,
# Multimask
multimask_output_for_tracking=True,
multimask_min_pt_num=0,
multimask_max_pt_num=1,
# Additional settings
always_start_from_first_ann_frame=False,
# Mask overlap
non_overlap_masks_for_mem_enc=False,
non_overlap_masks_for_output=False,
max_cond_frames_in_attn=4,
offload_output_to_cpu_for_eval=False,
# SAM decoder settings
sam_mask_decoder_extra_args={
"dynamic_multimask_via_stability": True,
"dynamic_multimask_stability_delta": 0.05,
"dynamic_multimask_stability_thresh": 0.98,
},
clear_non_cond_mem_around_input=True,
fill_hole_area=0,
use_memory_selection=apply_temporal_disambiguation,
)
return model
def _create_text_encoder(bpe_path: str) -> VETextEncoder:
"""Create SAM3 text encoder."""
tokenizer = SimpleTokenizer(bpe_path=bpe_path)
return VETextEncoder(
tokenizer=tokenizer,
d_model=256,
width=1024,
heads=16,
layers=24,
)
def _create_vision_backbone(
compile_mode=None, enable_inst_interactivity=True
) -> Sam3DualViTDetNeck:
"""Create SAM3 visual backbone with ViT and neck."""
# Position encoding
position_encoding = _create_position_encoding(precompute_resolution=1008)
# ViT backbone
vit_backbone: ViT = _create_vit_backbone(compile_mode=compile_mode)
vit_neck: Sam3DualViTDetNeck = _create_vit_neck(
position_encoding,
vit_backbone,
enable_inst_interactivity=enable_inst_interactivity,
)
# Visual neck
return vit_neck
def _create_sam3_transformer(has_presence_token: bool = True) -> TransformerWrapper:
"""Create SAM3 transformer encoder and decoder."""
encoder: TransformerEncoderFusion = _create_transformer_encoder()
decoder: TransformerDecoder = _create_transformer_decoder()
return TransformerWrapper(encoder=encoder, decoder=decoder, d_model=256)
def _load_checkpoint(model, checkpoint_path):
"""Load model checkpoint from file."""
with g_pathmgr.open(checkpoint_path, "rb") as f:
ckpt = torch.load(f, map_location="cpu", weights_only=True)
if "model" in ckpt and isinstance(ckpt["model"], dict):
ckpt = ckpt["model"]
sam3_image_ckpt = {
k.replace("detector.", ""): v for k, v in ckpt.items() if "detector" in k
}
if model.inst_interactive_predictor is not None:
sam3_image_ckpt.update(
{
k.replace("tracker.", "inst_interactive_predictor.model."): v
for k, v in ckpt.items()
if "tracker" in k
}
)
missing_keys, _ = model.load_state_dict(sam3_image_ckpt, strict=False)
if len(missing_keys) > 0:
print(
f"loaded {checkpoint_path} and found "
f"missing and/or unexpected keys:\n{missing_keys=}"
)
def _setup_device_and_mode(model, device, eval_mode):
"""Setup model device and evaluation mode."""
if device == "cuda":
model = model.cuda()
if eval_mode:
model.eval()
return model
def build_sam3_image_model(
bpe_path=None,
device="cuda" if torch.cuda.is_available() else "cpu",
eval_mode=True,
checkpoint_path="sam3.pt",
load_from_HF=False,
enable_segmentation=True,
enable_inst_interactivity=False,
compile=False,
):
"""
Build SAM3 image model
Args:
bpe_path: Path to the BPE tokenizer vocabulary
device: Device to load the model on ('cuda' or 'cpu')
eval_mode: Whether to set the model to evaluation mode
checkpoint_path: Optional path to model checkpoint
enable_segmentation: Whether to enable segmentation head
enable_inst_interactivity: Whether to enable instance interactivity (SAM 1 task)
compile_mode: To enable compilation, set to "default"
Returns:
A SAM3 image model
"""
if bpe_path is None:
bpe_path = os.path.join(
os.path.dirname(__file__), "..", "assets", "bpe_simple_vocab_16e6.txt.gz"
)
# Create visual components
compile_mode = "default" if compile else None
vision_encoder = _create_vision_backbone(
compile_mode=compile_mode, enable_inst_interactivity=enable_inst_interactivity
)
# Create text components
text_encoder = _create_text_encoder(bpe_path)
# Create visual-language backbone
backbone = _create_vl_backbone(vision_encoder, text_encoder)
# Create transformer components
transformer = _create_sam3_transformer()
# Create dot product scoring
dot_prod_scoring = _create_dot_product_scoring()
# Create segmentation head if enabled
segmentation_head = (
_create_segmentation_head(compile_mode=compile_mode)
if enable_segmentation
else None
)
# Create geometry encoder
input_geometry_encoder = _create_geometry_encoder()
if enable_inst_interactivity:
sam3_pvs_base = build_tracker(apply_temporal_disambiguation=False)
inst_predictor = SAM3InteractiveImagePredictor(sam3_pvs_base)
else:
inst_predictor = None
# Create the SAM3 model
model = _create_sam3_model(
backbone,
transformer,
input_geometry_encoder,
segmentation_head,
dot_prod_scoring,
inst_predictor,
eval_mode,
)
if load_from_HF and checkpoint_path is None:
checkpoint_path = download_ckpt_from_hf()
# Load checkpoint if provided
if checkpoint_path is not None:
_load_checkpoint(model, checkpoint_path)
# Setup device and mode
model = _setup_device_and_mode(model, device, eval_mode)
return model
def download_ckpt_from_hf():
SAM3_MODEL_ID = "facebook/sam3"
SAM3_CKPT_NAME = "sam3.pt"
SAM3_CFG_NAME = "config.json"
_ = hf_hub_download(repo_id=SAM3_MODEL_ID, filename=SAM3_CFG_NAME)
checkpoint_path = hf_hub_download(repo_id=SAM3_MODEL_ID, filename=SAM3_CKPT_NAME)
return checkpoint_path
def build_sam3_video_model(
checkpoint_path: Optional[str] = None,
load_from_HF=False,
bpe_path: Optional[str] = None,
has_presence_token: bool = True,
geo_encoder_use_img_cross_attn: bool = True,
strict_state_dict_loading: bool = True,
apply_temporal_disambiguation: bool = True,
device="cuda" if torch.cuda.is_available() else "cpu",
compile=False,
) -> Sam3VideoInferenceWithInstanceInteractivity:
"""
Build SAM3 dense tracking model.
Args:
checkpoint_path: Optional path to checkpoint file
bpe_path: Path to the BPE tokenizer file
Returns:
Sam3VideoInferenceWithInstanceInteractivity: The instantiated dense tracking model
"""
if bpe_path is None:
bpe_path = os.path.join(
os.path.dirname(__file__), "..", "assets", "bpe_simple_vocab_16e6.txt.gz"
)
# Build Tracker module
tracker = build_tracker(apply_temporal_disambiguation=apply_temporal_disambiguation)
# Build Detector components
visual_neck = _create_vision_backbone()
text_encoder = _create_text_encoder(bpe_path)
backbone = SAM3VLBackbone(scalp=1, visual=visual_neck, text=text_encoder)
transformer = _create_sam3_transformer(has_presence_token=has_presence_token)
segmentation_head: UniversalSegmentationHead = _create_segmentation_head()
input_geometry_encoder = _create_geometry_encoder()
# Create main dot product scoring
main_dot_prod_mlp = MLP(
input_dim=256,
hidden_dim=2048,
output_dim=256,
num_layers=2,
dropout=0.1,
residual=True,
out_norm=nn.LayerNorm(256),
)
main_dot_prod_scoring = DotProductScoring(
d_model=256, d_proj=256, prompt_mlp=main_dot_prod_mlp
)
# Build Detector module
detector = Sam3ImageOnVideoMultiGPU(
num_feature_levels=1,
backbone=backbone,
transformer=transformer,
segmentation_head=segmentation_head,
semantic_segmentation_head=None,
input_geometry_encoder=input_geometry_encoder,
use_early_fusion=True,
use_dot_prod_scoring=True,
dot_prod_scoring=main_dot_prod_scoring,
supervise_joint_box_scores=has_presence_token,
)
# Build the main SAM3 video model
if apply_temporal_disambiguation:
model = Sam3VideoInferenceWithInstanceInteractivity(
detector=detector,
tracker=tracker,
score_threshold_detection=0.5,
assoc_iou_thresh=0.1,
det_nms_thresh=0.1,
new_det_thresh=0.7,
hotstart_delay=15,
hotstart_unmatch_thresh=8,
hotstart_dup_thresh=8,
suppress_unmatched_only_within_hotstart=True,
min_trk_keep_alive=-1,
max_trk_keep_alive=30,
init_trk_keep_alive=30,
suppress_overlapping_based_on_recent_occlusion_threshold=0.7,
suppress_det_close_to_boundary=False,
fill_hole_area=16,
recondition_every_nth_frame=16,
masklet_confirmation_enable=False,
decrease_trk_keep_alive_for_empty_masklets=False,
image_size=1008,
image_mean=(0.5, 0.5, 0.5),
image_std=(0.5, 0.5, 0.5),
compile_model=compile,
)
else:
# a version without any heuristics for ablation studies
model = Sam3VideoInferenceWithInstanceInteractivity(
detector=detector,
tracker=tracker,
score_threshold_detection=0.5,
assoc_iou_thresh=0.1,
det_nms_thresh=0.1,
new_det_thresh=0.7,
hotstart_delay=0,
hotstart_unmatch_thresh=0,
hotstart_dup_thresh=0,
suppress_unmatched_only_within_hotstart=True,
min_trk_keep_alive=-1,
max_trk_keep_alive=30,
init_trk_keep_alive=30,
suppress_overlapping_based_on_recent_occlusion_threshold=0.7,
suppress_det_close_to_boundary=False,
fill_hole_area=16,
recondition_every_nth_frame=0,
masklet_confirmation_enable=False,
decrease_trk_keep_alive_for_empty_masklets=False,
image_size=1008,
image_mean=(0.5, 0.5, 0.5),
image_std=(0.5, 0.5, 0.5),
compile_model=compile,
)
# Load checkpoint if provided
if load_from_HF and checkpoint_path is None:
checkpoint_path = download_ckpt_from_hf()
if checkpoint_path is not None:
with g_pathmgr.open(checkpoint_path, "rb") as f:
ckpt = torch.load(f, map_location="cpu", weights_only=True)
if "model" in ckpt and isinstance(ckpt["model"], dict):
ckpt = ckpt["model"]
missing_keys, unexpected_keys = model.load_state_dict(
ckpt, strict=strict_state_dict_loading
)
if missing_keys:
print(f"Missing keys: {missing_keys}")
if unexpected_keys:
print(f"Unexpected keys: {unexpected_keys}")
model.to(device=device)
return model
def build_sam3_video_predictor(*model_args, gpus_to_use=None, **model_kwargs):
return Sam3VideoPredictorMultiGPU(
*model_args, gpus_to_use=gpus_to_use, **model_kwargs
)
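With this version of model_builder.py in place, build_sam3_image_model() defaults to loading a local sam3.pt from the working directory. You can also point it at the uploaded checkpoint explicitly; the path below is a placeholder:
from sam3.model_builder import build_sam3_image_model

# Load the weights from the locally uploaded checkpoint instead of downloading
model = build_sam3_image_model(
    checkpoint_path="/path/to/sam3.pt",  # placeholder: wherever you uploaded sam3.pt
    load_from_HF=False,
)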
Now modify the visualization code in visualization_utils.py, which also lives under sam3/sam3.
Since a headless Linux server cannot pop up an interactive window, I modified the plot_results() function so that it writes the result directly to a .jpg file. The full modified file follows.
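As a side note before the full file: a complementary trick for headless servers (general matplotlib advice, not something the SAM3 code requires) is to force the non-interactive Agg backend before pyplot is imported, so that no display is ever needed:
import matplotlib
matplotlib.use("Agg")  # select the non-interactive backend before importing pyplot
import matplotlib.pyplot as plt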
# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
import json
import os
import subprocess
from pathlib import Path
import cv2
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pycocotools.mask as mask_utils
import torch
from matplotlib.colors import to_rgb
from PIL import Image
from skimage.color import lab2rgb, rgb2lab
from sklearn.cluster import KMeans
from torchvision.ops import masks_to_boxes
from tqdm import tqdm
def generate_colors(n_colors=256, n_samples=5000):
# Step 1: Random RGB samples
np.random.seed(42)
rgb = np.random.rand(n_samples, 3)
# Step 2: Convert to LAB for perceptual uniformity
# print(f"Converting {n_samples} RGB samples to LAB color space...")
lab = rgb2lab(rgb.reshape(1, -1, 3)).reshape(-1, 3)
# print("Conversion to LAB complete.")
# Step 3: k-means clustering in LAB
kmeans = KMeans(n_clusters=n_colors, n_init=10)
# print(f"Fitting KMeans with {n_colors} clusters on {n_samples} samples...")
kmeans.fit(lab)
# print("KMeans fitting complete.")
centers_lab = kmeans.cluster_centers_
# Step 4: Convert LAB back to RGB
colors_rgb = lab2rgb(centers_lab.reshape(1, -1, 3)).reshape(-1, 3)
colors_rgb = np.clip(colors_rgb, 0, 1)
return colors_rgb
COLORS = generate_colors(n_colors=128, n_samples=5000)
def show_img_tensor(img_batch, vis_img_idx=0):
MEAN_IMG = np.array([0.5, 0.5, 0.5])
STD_IMG = np.array([0.5, 0.5, 0.5])
im_tensor = img_batch[vis_img_idx].detach().cpu()
assert im_tensor.dim() == 3
im_tensor = im_tensor.numpy().transpose((1, 2, 0))
im_tensor = (im_tensor * STD_IMG) + MEAN_IMG
im_tensor = np.clip(im_tensor, 0, 1)
plt.imshow(im_tensor)
def draw_box_on_image(image, box, color=(0, 255, 0)):
"""
Draws a rectangle on a given PIL image using the provided box coordinates in xywh format.
:param image: PIL.Image - The image on which to draw the rectangle.
:param box: tuple - A tuple (x, y, w, h) representing the top-left corner, width, and height of the rectangle.
:param color: tuple - A tuple (R, G, B) representing the color of the rectangle. Default is red.
:return: PIL.Image - The image with the rectangle drawn on it.
"""
# Ensure the image is in RGB mode
image = image.convert("RGB")
# Unpack the box coordinates
x, y, w, h = box
x, y, w, h = int(x), int(y), int(w), int(h)
# Get the pixel data
pixels = image.load()
# Draw the top and bottom edges
for i in range(x, x + w):
pixels[i, y] = color
pixels[i, y + h - 1] = color
pixels[i, y + 1] = color
pixels[i, y + h] = color
pixels[i, y - 1] = color
pixels[i, y + h - 2] = color
# Draw the left and right edges
for j in range(y, y + h):
pixels[x, j] = color
pixels[x + 1, j] = color
pixels[x - 1, j] = color
pixels[x + w - 1, j] = color
pixels[x + w, j] = color
pixels[x + w - 2, j] = color
return image
def plot_bbox(
img_height,
img_width,
box,
box_format="XYXY",
relative_coords=True,
color="r",
linestyle="solid",
text=None,
ax=None,
):
if box_format == "XYXY":
x, y, x2, y2 = box
w = x2 - x
h = y2 - y
elif box_format == "XYWH":
x, y, w, h = box
elif box_format == "CxCyWH":
cx, cy, w, h = box
x = cx - w / 2
y = cy - h / 2
else:
raise RuntimeError(f"Invalid box_format {box_format}")
if relative_coords:
x *= img_width
w *= img_width
y *= img_height
h *= img_height
if ax is None:
ax = plt.gca()
rect = patches.Rectangle(
(x, y),
w,
h,
linewidth=1.5,
edgecolor=color,
facecolor="none",
linestyle=linestyle,
)
ax.add_patch(rect)
if text is not None:
facecolor = "w"
ax.text(
x,
y - 5,
text,
color=color,
weight="bold",
fontsize=8,
bbox={"facecolor": facecolor, "alpha": 0.75, "pad": 2},
)
def plot_mask(mask, color="r", ax=None):
im_h, im_w = mask.shape
mask_img = np.zeros((im_h, im_w, 4), dtype=np.float32)
mask_img[..., :3] = to_rgb(color)
mask_img[..., 3] = mask * 0.5
# Use the provided ax or the current axis
if ax is None:
ax = plt.gca()
ax.imshow(mask_img)
def normalize_bbox(bbox_xywh, img_w, img_h):
# Assumes bbox_xywh is in XYWH format
if isinstance(bbox_xywh, list):
assert (
len(bbox_xywh) == 4
), "bbox_xywh list must have 4 elements. Batching not support except for torch tensors."
normalized_bbox = bbox_xywh.copy()
normalized_bbox[0] /= img_w
normalized_bbox[1] /= img_h
normalized_bbox[2] /= img_w
normalized_bbox[3] /= img_h
else:
assert isinstance(
bbox_xywh, torch.Tensor
), "Only torch tensors are supported for batching."
normalized_bbox = bbox_xywh.clone()
assert (
normalized_bbox.size(-1) == 4
), "bbox_xywh tensor must have last dimension of size 4."
normalized_bbox[..., 0] /= img_w
normalized_bbox[..., 1] /= img_h
normalized_bbox[..., 2] /= img_w
normalized_bbox[..., 3] /= img_h
return normalized_bbox
def visualize_frame_output(frame_idx, video_frames, outputs, figsize=(12, 8)):
plt.figure(figsize=figsize)
plt.title(f"frame {frame_idx}")
img = load_frame(video_frames[frame_idx])
img_H, img_W, _ = img.shape
plt.imshow(img)
for i in range(len(outputs["out_probs"])):
box_xywh = outputs["out_boxes_xywh"][i]
prob = outputs["out_probs"][i]
obj_id = outputs["out_obj_ids"][i]
binary_mask = outputs["out_binary_masks"][i]
color = COLORS[obj_id % len(COLORS)]
plot_bbox(
img_H,
img_W,
box_xywh,
text=f"(id={obj_id}, {prob=:.2f})",
box_format="XYWH",
color=color,
)
plot_mask(binary_mask, color=color)
def visualize_formatted_frame_output(
frame_idx,
video_frames,
outputs_list,
titles=None,
points_list=None,
points_labels_list=None,
figsize=(12, 8),
title_suffix="",
prompt_info=None,
):
"""Visualize up to three sets of segmentation masks on a video frame.
Args:
frame_idx: Frame index to visualize
image_files: List of image file paths
outputs_list: List of {frame_idx: {obj_id: mask_tensor}} or single dict {obj_id: mask_tensor}
titles: List of titles for each set of outputs_list
points_list: Optional list of point coordinates
points_labels_list: Optional list of point labels
figsize: Figure size tuple
save: Whether to save the visualization to file
output_dir: Base output directory when saving
scenario_name: Scenario name for organizing saved files
title_suffix: Additional title suffix
prompt_info: Dictionary with prompt information (boxes, points, etc.)
"""
# Handle single output dict case
if isinstance(outputs_list, dict) and frame_idx in outputs_list:
# This is a single outputs dict with frame indices as keys
outputs_list = [outputs_list]
elif isinstance(outputs_list, dict) and not any(
isinstance(k, int) for k in outputs_list.keys()
):
# This is a single frame's outputs {obj_id: mask}
single_frame_outputs = {frame_idx: outputs_list}
outputs_list = [single_frame_outputs]
num_outputs = len(outputs_list)
if titles is None:
titles = [f"Set {i+1}" for i in range(num_outputs)]
assert (
len(titles) == num_outputs
), "length of `titles` should match that of `outputs_list` if not None."
_, axes = plt.subplots(1, num_outputs, figsize=figsize)
if num_outputs == 1:
axes = [axes] # Make it iterable
img = load_frame(video_frames[frame_idx])
img_H, img_W, _ = img.shape
for idx in range(num_outputs):
ax, outputs_set, ax_title = axes[idx], outputs_list[idx], titles[idx]
ax.set_title(f"Frame {frame_idx} - {ax_title}{title_suffix}")
ax.imshow(img)
if frame_idx in outputs_set:
_outputs = outputs_set[frame_idx]
else:
print(f"Warning: Frame {frame_idx} not found in outputs_set")
continue
if prompt_info and frame_idx == 0: # Show prompts on first frame
if "boxes" in prompt_info:
for box in prompt_info["boxes"]:
# box is in [x, y, w, h] normalized format
x, y, w, h = box
plot_bbox(
img_H,
img_W,
[x, y, x + w, y + h], # Convert to XYXY
box_format="XYXY",
relative_coords=True,
color="yellow",
linestyle="dashed",
text="PROMPT BOX",
ax=ax,
)
if "points" in prompt_info and "point_labels" in prompt_info:
points = np.array(prompt_info["points"])
labels = np.array(prompt_info["point_labels"])
# Convert normalized to pixel coordinates
points_pixel = points * np.array([img_W, img_H])
# Draw positive points (green stars)
pos_points = points_pixel[labels == 1]
if len(pos_points) > 0:
ax.scatter(
pos_points[:, 0],
pos_points[:, 1],
color="lime",
marker="*",
s=200,
edgecolor="white",
linewidth=2,
label="Positive Points",
zorder=10,
)
# Draw negative points (red stars)
neg_points = points_pixel[labels == 0]
if len(neg_points) > 0:
ax.scatter(
neg_points[:, 0],
neg_points[:, 1],
color="red",
marker="*",
s=200,
edgecolor="white",
linewidth=2,
label="Negative Points",
zorder=10,
)
objects_drawn = 0
for obj_id, binary_mask in _outputs.items():
mask_sum = (
binary_mask.sum()
if hasattr(binary_mask, "sum")
else np.sum(binary_mask)
)
if mask_sum > 0: # Only draw if mask has content
# Convert to torch tensor if it's not already
if not isinstance(binary_mask, torch.Tensor):
binary_mask = torch.tensor(binary_mask)
# Find bounding box from mask
if binary_mask.any():
box_xyxy = masks_to_boxes(binary_mask.unsqueeze(0)).squeeze()
box_xyxy = normalize_bbox(box_xyxy, img_W, img_H)
else:
# Fallback: create a small box at center
box_xyxy = [0.45, 0.45, 0.55, 0.55]
color = COLORS[obj_id % len(COLORS)]
plot_bbox(
img_H,
img_W,
box_xyxy,
text=f"(id={obj_id})",
box_format="XYXY",
color=color,
ax=ax,
)
# Convert back to numpy for plotting
mask_np = (
binary_mask.numpy()
if isinstance(binary_mask, torch.Tensor)
else binary_mask
)
plot_mask(mask_np, color=color, ax=ax)
objects_drawn += 1
if objects_drawn == 0:
ax.text(
0.5,
0.5,
"No objects detected",
transform=ax.transAxes,
fontsize=16,
ha="center",
va="center",
color="red",
weight="bold",
)
# Draw additional points if provided
if points_list is not None and points_list[idx] is not None:
show_points(
points_list[idx], points_labels_list[idx], ax=ax, marker_size=200
)
ax.axis("off")
plt.tight_layout()
plt.show()
def render_masklet_frame(img, outputs, frame_idx=None, alpha=0.5):
"""
Overlays masklets and bounding boxes on a single image frame.
Args:
img: np.ndarray, shape (H, W, 3), uint8 or float32 in [0,255] or [0,1]
outputs: dict with keys: out_boxes_xywh, out_probs, out_obj_ids, out_binary_masks
frame_idx: int or None, for overlaying frame index text
alpha: float, mask overlay alpha
Returns:
overlay: np.ndarray, shape (H, W, 3), uint8
"""
if img.dtype == np.float32 or img.max() <= 1.0:
img = (img * 255).astype(np.uint8)
img = img[..., :3] # drop alpha if present
height, width = img.shape[:2]
overlay = img.copy()
for i in range(len(outputs["out_probs"])):
obj_id = outputs["out_obj_ids"][i]
color = COLORS[obj_id % len(COLORS)]
color255 = (color * 255).astype(np.uint8)
mask = outputs["out_binary_masks"][i]
if mask.shape != img.shape[:2]:
mask = cv2.resize(
mask.astype(np.float32),
(img.shape[1], img.shape[0]),
interpolation=cv2.INTER_NEAREST,
)
mask_bool = mask > 0.5
for c in range(3):
overlay[..., c][mask_bool] = (
alpha * color255[c] + (1 - alpha) * overlay[..., c][mask_bool]
).astype(np.uint8)
# Draw bounding boxes and text
for i in range(len(outputs["out_probs"])):
box_xywh = outputs["out_boxes_xywh"][i]
obj_id = outputs["out_obj_ids"][i]
prob = outputs["out_probs"][i]
color = COLORS[obj_id % len(COLORS)]
color255 = tuple(int(x * 255) for x in color)
x, y, w, h = box_xywh
x1 = int(x * width)
y1 = int(y * height)
x2 = int((x + w) * width)
y2 = int((y + h) * height)
cv2.rectangle(overlay, (x1, y1), (x2, y2), color255, 2)
if prob is not None:
label = f"id={obj_id}, p={prob:.2f}"
else:
label = f"id={obj_id}"
cv2.putText(
overlay,
label,
(x1, max(y1 - 10, 0)),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
color255,
1,
cv2.LINE_AA,
)
# Overlay frame index at the top-left corner
if frame_idx is not None:
cv2.putText(
overlay,
f"Frame {frame_idx}",
(10, 30),
cv2.FONT_HERSHEY_SIMPLEX,
1.0,
(255, 255, 255),
2,
cv2.LINE_AA,
)
return overlay
def save_masklet_video(video_frames, outputs, out_path, alpha=0.5, fps=10):
# Each outputs dict has keys: "out_boxes_xywh", "out_probs", "out_obj_ids", "out_binary_masks"
# video_frames: list of video frame data, same length as outputs_list
# Read first frame to get size
first_img = load_frame(video_frames[0])
height, width = first_img.shape[:2]
if first_img.dtype == np.float32 or first_img.max() <= 1.0:
first_img = (first_img * 255).astype(np.uint8)
# Use 'mp4v' for best compatibility with VSCode playback (.mp4 files)
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter("temp.mp4", fourcc, fps, (width, height))
outputs_list = [
(video_frames[frame_idx], frame_idx, outputs[frame_idx])
for frame_idx in sorted(outputs.keys())
]
for frame, frame_idx, frame_outputs in tqdm(outputs_list):
img = load_frame(frame)
overlay = render_masklet_frame(
img, frame_outputs, frame_idx=frame_idx, alpha=alpha
)
writer.write(cv2.cvtColor(overlay, cv2.COLOR_RGB2BGR))
writer.release()
# Re-encode the video for VSCode compatibility using ffmpeg
subprocess.run(["ffmpeg", "-y", "-i", "temp.mp4", out_path])
print(f"Re-encoded video saved to {out_path}")
os.remove("temp.mp4") # Clean up temporary file
def save_masklet_image(frame, outputs, out_path, alpha=0.5, frame_idx=None):
"""
Save a single image with masklet overlays.
"""
img = load_frame(frame)
overlay = render_masklet_frame(img, outputs, frame_idx=frame_idx, alpha=alpha)
Image.fromarray(overlay).save(out_path)
print(f"Overlay image saved to {out_path}")
def prepare_masks_for_visualization(frame_to_output):
# frame_to_obj_masks --> {frame_idx: {'output_probs': np.array, `out_obj_ids`: np.array, `out_binary_masks`: np.array}}
for frame_idx, out in frame_to_output.items():
_processed_out = {}
for idx, obj_id in enumerate(out["out_obj_ids"].tolist()):
if out["out_binary_masks"][idx].any():
_processed_out[obj_id] = out["out_binary_masks"][idx]
frame_to_output[frame_idx] = _processed_out
return frame_to_output
def convert_coco_to_masklet_format(
annotations, img_info, is_prediction=False, score_threshold=0.5
):
"""
Convert COCO format annotations to format expected by render_masklet_frame
"""
outputs = {
"out_boxes_xywh": [],
"out_probs": [],
"out_obj_ids": [],
"out_binary_masks": [],
}
img_h, img_w = img_info["height"], img_info["width"]
for idx, ann in enumerate(annotations):
# Get bounding box in relative XYWH format
if "bbox" in ann:
bbox = ann["bbox"]
if max(bbox) > 1.0: # Convert absolute to relative coordinates
bbox = [
bbox[0] / img_w,
bbox[1] / img_h,
bbox[2] / img_w,
bbox[3] / img_h,
]
else:
mask = mask_utils.decode(ann["segmentation"])
rows = np.any(mask, axis=1)
cols = np.any(mask, axis=0)
if np.any(rows) and np.any(cols):
rmin, rmax = np.where(rows)[0][[0, -1]]
cmin, cmax = np.where(cols)[0][[0, -1]]
# Convert to relative XYWH
bbox = [
cmin / img_w,
rmin / img_h,
(cmax - cmin + 1) / img_w,
(rmax - rmin + 1) / img_h,
]
else:
bbox = [0, 0, 0, 0]
outputs["out_boxes_xywh"].append(bbox)
# Get probability/score
if is_prediction:
prob = ann["score"]
else:
prob = 1.0 # GT has no probability
outputs["out_probs"].append(prob)
outputs["out_obj_ids"].append(idx)
mask = mask_utils.decode(ann["segmentation"])
mask = (mask > score_threshold).astype(np.uint8)
outputs["out_binary_masks"].append(mask)
return outputs
def save_side_by_side_visualization(img, gt_anns, pred_anns, noun_phrase):
"""
Create side-by-side visualization of GT and predictions
"""
# Create side-by-side visualization
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))
main_title = f"Noun phrase: '{noun_phrase}'"
fig.suptitle(main_title, fontsize=16, fontweight="bold")
gt_overlay = render_masklet_frame(img, gt_anns, alpha=0.5)
ax1.imshow(gt_overlay)
ax1.set_title("Ground Truth", fontsize=14, fontweight="bold")
ax1.axis("off")
pred_overlay = render_masklet_frame(img, pred_anns, alpha=0.5)
ax2.imshow(pred_overlay)
ax2.set_title("Predictions", fontsize=14, fontweight="bold")
ax2.axis("off")
plt.subplots_adjust(top=0.88)
plt.tight_layout()
def bitget(val, idx):
return (val >> idx) & 1
def pascal_color_map():
colormap = np.zeros((512, 3), dtype=int)
ind = np.arange(512, dtype=int)
for shift in reversed(list(range(8))):
for channel in range(3):
colormap[:, channel] |= bitget(ind, channel) << shift
ind >>= 3
return colormap.astype(np.uint8)
def draw_masks_to_frame(
frame: np.ndarray, masks: np.ndarray, colors: np.ndarray
) -> np.ndarray:
masked_frame = frame
for mask, color in zip(masks, colors):
curr_masked_frame = np.where(mask[..., None], color, masked_frame)
masked_frame = cv2.addWeighted(masked_frame, 0.75, curr_masked_frame, 0.25, 0)
if int(cv2.__version__[0]) > 3:
contours, _ = cv2.findContours(
np.array(mask, dtype=np.uint8).copy(),
cv2.RETR_TREE,
cv2.CHAIN_APPROX_NONE,
)
else:
_, contours, _ = cv2.findContours(
np.array(mask, dtype=np.uint8).copy(),
cv2.RETR_TREE,
cv2.CHAIN_APPROX_NONE,
)
cv2.drawContours(
masked_frame, contours, -1, (255, 255, 255), 7
) # White outer contour
cv2.drawContours(
masked_frame, contours, -1, (0, 0, 0), 5
) # Black middle contour
cv2.drawContours(
masked_frame, contours, -1, color.tolist(), 3
) # Original color inner contour
return masked_frame
def get_annot_df(file_path: str):
with open(file_path, "r") as f:
data = json.load(f)
dfs = {}
for k, v in data.items():
if k in ("info", "licenses"):
dfs[k] = v
continue
df = pd.DataFrame(v)
dfs[k] = df
return dfs
def get_annot_dfs(file_list: list[str]):
dfs = {}
for annot_file in tqdm(file_list):
dataset_name = Path(annot_file).stem
dfs[dataset_name] = get_annot_df(annot_file)
return dfs
def get_media_dir(media_dir: str, dataset: str):
if dataset in ["saco_veval_sav_test", "saco_veval_sav_val"]:
return os.path.join(media_dir, "saco_sav", "JPEGImages_24fps")
elif dataset in ["saco_veval_yt1b_test", "saco_veval_yt1b_val"]:
return os.path.join(media_dir, "saco_yt1b", "JPEGImages_6fps")
elif dataset in ["saco_veval_smartglasses_test", "saco_veval_smartglasses_val"]:
return os.path.join(media_dir, "saco_sg", "JPEGImages_6fps")
elif dataset == "sa_fari_test":
return os.path.join(media_dir, "sa_fari", "JPEGImages_6fps")
else:
raise ValueError(f"Dataset {dataset} not found")
def get_all_annotations_for_frame(
dataset_df: pd.DataFrame, video_id: int, frame_idx: int, data_dir: str, dataset: str
):
media_dir = os.path.join(data_dir, "media")
# Load the annotation and video data
annot_df = dataset_df["annotations"]
video_df = dataset_df["videos"]
# Get the frame
video_df_current = video_df[video_df.id == video_id]
assert (
len(video_df_current) == 1
), f"Expected 1 video row, got {len(video_df_current)}"
video_row = video_df_current.iloc[0]
file_name = video_row.file_names[frame_idx]
file_path = os.path.join(
get_media_dir(media_dir=media_dir, dataset=dataset), file_name
)
frame = cv2.imread(file_path)
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
# Get the masks and noun phrases annotated in this video in this frame
annot_df_current_video = annot_df[annot_df.video_id == video_id]
if len(annot_df_current_video) == 0:
print(f"No annotations found for video_id {video_id}")
return frame, None, None
else:
empty_mask = np.zeros(frame.shape[:2], dtype=np.uint8)
mask_np_pairs = annot_df_current_video.apply(
lambda row: (
(
mask_utils.decode(row.segmentations[frame_idx])
if row.segmentations[frame_idx]
else empty_mask
),
row.noun_phrase,
),
axis=1,
)
# sort based on noun_phrases
mask_np_pairs = sorted(mask_np_pairs, key=lambda x: x[1])
masks, noun_phrases = zip(*mask_np_pairs)
return frame, masks, noun_phrases
def visualize_prompt_overlay(
frame_idx,
video_frames,
title="Prompt Visualization",
text_prompt=None,
point_prompts=None,
point_labels=None,
bounding_boxes=None,
box_labels=None,
obj_id=None,
):
"""Simple prompt visualization function"""
img = Image.fromarray(load_frame(video_frames[frame_idx]))
fig, ax = plt.subplots(1, figsize=(6, 4))
ax.imshow(img)
img_w, img_h = img.size
if text_prompt:
ax.text(
0.02,
0.98,
f'Text: "{text_prompt}"',
transform=ax.transAxes,
fontsize=12,
color="white",
weight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="red", alpha=0.7),
verticalalignment="top",
)
if point_prompts:
for i, point in enumerate(point_prompts):
x, y = point
# Convert relative to absolute coordinates
x_img, y_img = x * img_w, y * img_h
# Use different colors for positive/negative points
if point_labels and len(point_labels) > i:
color = "green" if point_labels[i] == 1 else "red"
marker = "o" if point_labels[i] == 1 else "x"
else:
color = "green"
marker = "o"
ax.plot(
x_img,
y_img,
marker=marker,
color=color,
markersize=10,
markeredgewidth=2,
markeredgecolor="white",
)
ax.text(
x_img + 5,
y_img - 5,
f"P{i+1}",
color=color,
fontsize=10,
weight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.8),
)
if bounding_boxes:
for i, box in enumerate(bounding_boxes):
x, y, w, h = box
# Convert relative to absolute coordinates
x_img, y_img = x * img_w, y * img_h
w_img, h_img = w * img_w, h * img_h
# Use different colors for positive/negative boxes
if box_labels and len(box_labels) > i:
color = "green" if box_labels[i] == 1 else "red"
else:
color = "green"
rect = patches.Rectangle(
(x_img, y_img),
w_img,
h_img,
linewidth=2,
edgecolor=color,
facecolor="none",
)
ax.add_patch(rect)
ax.text(
x_img,
y_img - 5,
f"B{i+1}",
color=color,
fontsize=10,
weight="bold",
bbox=dict(boxstyle="round,pad=0.2", facecolor="white", alpha=0.8),
)
# Add object ID info if provided
if obj_id is not None:
ax.text(
0.02,
0.02,
f"Object ID: {obj_id}",
transform=ax.transAxes,
fontsize=10,
color="white",
weight="bold",
bbox=dict(boxstyle="round,pad=0.3", facecolor="blue", alpha=0.7),
verticalalignment="bottom",
)
ax.set_title(title)
ax.axis("off")
plt.tight_layout()
plt.show()
def plot_results(img, results, output_file='output.jpg'):
"""
Draw the results (bounding boxes and masks) on the image and save it as a .jpg file.
"""
plt.figure(figsize=(12, 8))
plt.imshow(img)
nb_objects = len(results["scores"])
print(f"检测到 {nb_objects} 个物体")
for i in range(nb_objects):
color = COLORS[i % len(COLORS)]
plot_mask(results["masks"][i].squeeze(0).cpu(), color=color)
w, h = img.size
prob = results["scores"][i].item()
plot_bbox(
h,
w,
results["boxes"][i].cpu(),
text=f"(id={i}, {prob=:.2f})",
box_format="XYXY",
color=color,
relative_coords=False,
)
# Save as a .jpg file
plt.savefig(output_file, format='jpg', dpi=300)
plt.close()  # Close the figure to free memory
def single_visualization(img, anns, title):
"""
Create a single image visualization with overlays.
"""
fig, ax = plt.subplots(figsize=(7, 7))
fig.suptitle(title, fontsize=16, fontweight="bold")
overlay = render_masklet_frame(img, anns, alpha=0.5)
ax.imshow(overlay)
ax.axis("off")
plt.tight_layout()
def show_mask(mask, ax, obj_id=None, random_color=False):
if random_color:
color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
else:
cmap = plt.get_cmap("tab10")
cmap_idx = 0 if obj_id is None else obj_id
color = np.array([*cmap(cmap_idx)[:3], 0.6])
h, w = mask.shape[-2:]
mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
ax.imshow(mask_image)
def show_box(box, ax):
x0, y0 = box[0], box[1]
w, h = box[2] - box[0], box[3] - box[1]
ax.add_patch(
plt.Rectangle((x0, y0), w, h, edgecolor="green", facecolor=(0, 0, 0, 0), lw=2)
)
def show_points(coords, labels, ax, marker_size=375):
pos_points = coords[labels == 1]
neg_points = coords[labels == 0]
ax.scatter(
pos_points[:, 0],
pos_points[:, 1],
color="green",
marker="*",
s=marker_size,
edgecolor="white",
linewidth=1.25,
)
ax.scatter(
neg_points[:, 0],
neg_points[:, 1],
color="red",
marker="*",
s=marker_size,
edgecolor="white",
linewidth=1.25,
)
def load_frame(frame):
if isinstance(frame, np.ndarray):
img = frame
elif isinstance(frame, Image.Image):
img = np.array(frame)
elif isinstance(frame, str) and os.path.isfile(frame):
img = plt.imread(frame)
else:
raise ValueError(f"Invalid video frame type: {type(frame)=}")
return img
Below is the result on a test image I found online, using the text prompt "mirror":

All images are sourced from the internet; if there is any infringement, please contact me and they will be removed.
