使用GroundingDINO提示词进行目标检测单张或批量标注。自己看教程研究的,整理出来的过程如果有问题还请指正。
参考教程视频:https://www.bilibili.com/video/BV1dU1BYXEgj/?spm_id_from=333.337.search-card.all.click&vd_source=7d778d43a7af5bca30ce6c124550edfd
【提示】本方法需要魔法(梯子)
一、 下载及部署
项目官方地址:
https://github.com/IDEA-Research/GroundingDINO
环境配置参考:
https://github.com/Dongdong-d/GroundingDino-Finetuning
官网下载好项目后,参考环境配置项目的readme 完成下方操作:
1.下载权重:
两个权重可以选一个下载
Swin-T:https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
Swin-B:https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth
2.环境配置
- 安装依赖
pip install -r requirements.txt
#注意 numpy<1.24 , yapf==0.40.1 (numpy 版本需低于 1.24)
- 安装groundingdino
cd GroundingDino-Finetuning/
pip install -e .
- 安装MultiScaleDeformableAttention注意力
cd groundingdino/models/GroundingDINO/ops
python setup.py install
- 测试MultiScaleDeformableAttention安装是否成功
python test.py
出现以下内容即为成功:
True check_forward_equal_with_pytorch_double: max_abs_err 8.67e-19 max_rel_err 2.35e-16
True check_forward_equal_with_pytorch_float: max_abs_err 4.66e-10 max_rel_err 1.13e-07
True check_gradient_numerical(D=30)
True check_gradient_numerical(D=32)
True check_gradient_numerical(D=64)
True check_gradient_numerical(D=71)
二、提示词标注脚本
【重要!!!】下面脚本执行方法:不能直接 run,需要先开启梯子(代理),并把代理端口配置复制到 PyCharm 的终端(Terminal)中,然后使用 python demo.py 命令来执行(具体操作请看开头的 bilibili 视频)
1.处理单张图像(这是视频教程中的代码)
在官方项目中新建一个脚本,将下面代码复制进去,注意修改路径(防止出错最好使用绝对路径)!
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
import cv2

# --- configuration (prefer absolute paths to avoid path errors) ---
CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"  # config file shipped with the repo
CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"                # checkpoint downloaded above
DEVICE = "cuda"                                                  # "cpu" or "cuda"
IMAGE_PATH = "./testIMG.png"                                     # image to run detection on
TEXT_PROMPT = "There are many cars, people, and motorcycles on the street"  # edit to describe your scene
BOX_TRESHOLD = 0.35   # box confidence threshold (repo default)
TEXT_TRESHOLD = 0.25  # text/phrase matching threshold (repo default)

# Load the model, then run one prediction on the single input image.
model = load_model(CONFIG_PATH, CHECKPOINT_PATH)
image_source, image = load_image(IMAGE_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD,
    device=DEVICE,
)

# Draw the detected boxes and phrase labels, then save the visualization.
annotated = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated)
2. 批量标注多张图像, 并且保存LabelImg格式的TXT标注文件,方便做后续处理
import os
import cv2
import re
from groundingdino.util.inference import load_model, load_image, predict, annotate
# ====== User configuration ======
# 1. Model configuration (change these to your own paths)
CONFIG_PATH = "/GroundingDINO-main/groundingdino/config/GroundingDINO_SwinT_OGC.py"  # config shipped with the repo
CHECKPOINT_PATH = "/GroundingDINO-main/groundingdino_swint_ogc.pth"  # downloaded Swin-T checkpoint
DEVICE = "cpu"  # "cuda" if a GPU is available (author used CPU because of an unresolved C++ build issue)
# 2. Data paths (change these to your own directories)
IMAGE_DIR = "/GroundingDINO-main/myDataset/images"  # input image directory
OUTPUT_DIR = "/GroundingDINO-main/myDataset/output"  # output dir: annotated images + LabelImg-readable txt files
# 3. Detection parameters (tune over several runs; the prompt must describe the objects you want detected!)
TEXT_PROMPT = "white round disc.There are many cars, people,and motorcycles on the street"
BOX_TRESHOLD = 0.22  # box confidence threshold (typical range 0.3-0.5)
TEXT_TRESHOLD = 0.25  # text threshold (typical range 0.25-0.35)
# ====== 主程序开始 ======
def main():
# 确保输出目录存在
os.makedirs(OUTPUT_DIR, exist_ok=True)
# 加载模型 (只需加载一次)
print("加载模型中...")
model = load_model(CONFIG_PATH, CHECKPOINT_PATH)
print(f"模型加载完成,使用设备: {DEVICE}")
# 获取所有支持的图像文件
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(image_extensions)]
total_images = len(image_files)
print(f"在目录 {IMAGE_DIR} 中找到 {total_images} 张图像")
print(f"使用提示词: {TEXT_PROMPT}")
print(f"参数设置: box_threshold={BOX_TRESHOLD}, text_threshold={TEXT_TRESHOLD}")
# 处理每张图像
for i, image_file in enumerate(image_files):
image_path = os.path.join(IMAGE_DIR, image_file)
print(f"\n处理图像 {i + 1}/{total_images}: {image_file}")
try:
# 加载图像
image_source, image = load_image(image_path)
# 进行预测
boxes, logits, phrases = predict(
model=model,
image=image,
caption=TEXT_PROMPT,
box_threshold=BOX_TRESHOLD,
text_threshold=TEXT_TRESHOLD,
device=DEVICE,
)
# 显示检测结果
print(f"检测到 {len(boxes)} 个目标: {phrases}")
# ====== 新增:创建映射后的标签列表用于图像标注 ======
mapped_phrases = []
for phrase in phrases:
# 标准化短语:小写+去除特殊字符
clean_phrase = phrase.strip().lower()
clean_phrase = re.sub(r'[^a-z0-9\s]', '', clean_phrase) # 移除非字母数字字符
# 基于关键词的标签映射(这里是我自己的逻辑处理,可以将关键词转换成规范的label,
#不然标注出来的图像标签会不统一)
if "grass" in clean_phrase:
class_name = "grass"
else:
class_name = "circle" # 无法识别的标签设为circle
mapped_phrases.append(class_name)
# 生成并保存标注图像(使用映射后的标签)
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=mapped_phrases)
image_output_path = os.path.join(OUTPUT_DIR, f"annotated_{image_file}")
cv2.imwrite(image_output_path, annotated_frame)
print(f"标注图像已保存至: {image_output_path}")
# ====== 保存LabelImg格式的TXT标注文件 ======
# 获取图像尺寸 (用于坐标转换)
height, width, _ = image_source.shape
# 创建TXT文件路径 (与图像同名,扩展名改为.txt)
txt_filename = os.path.splitext(image_file)[0] + ".txt"
txt_output_path = os.path.join(OUTPUT_DIR, txt_filename)
with open(txt_output_path, "w") as f:
# 写入每个检测目标的信息
for box, phrase in zip(boxes, phrases):
# 标准化短语:小写+去除特殊字符
clean_phrase = phrase.strip().lower()
clean_phrase = re.sub(r'[^a-z0-9\s]', '', clean_phrase) # 移除非字母数字字符
# 基于关键词的标签映射(同上,这里处理labelimg中的标签,自行修改)
if "grass" in clean_phrase:
class_name = "grass"
else:
class_name = "circle" # 无法识别的标签设为circle
# 解析边界框坐标 (中心点坐标和宽高)
center_x, center_y, w, h = box
# 转换为YOLO格式的归一化坐标
center_x_norm = center_x
center_y_norm = center_y
w_norm = w
h_norm = h
# 写入YOLO格式的行: class_id center_x center_y width height
f.write(f"{class_id} {center_x_norm:.6f} {center_y_norm:.6f} {w_norm:.6f} {h_norm:.6f}\n")
print(f"YOLO格式标注已保存至: {txt_output_path}")
except Exception as e:
print(f"处理图像 {image_file} 时出错: {str(e)}")
print("\n===== 批量标注完成 =====")
print(f"所有标注结果保存在: {OUTPUT_DIR}")
if __name__ == "__main__":
main()
如果有问题也可以参考这篇https://blog.youkuaiyun.com/qq_44442727/article/details/137677031
4032

被折叠的 条评论
为什么被折叠?



