使用GroundingDINO提示词进行目标检测单张或批量标注。自己看教程研究的,整理出来的过程如果有问题还请指正。
参考教程视频:https://www.bilibili.com/video/BV1dU1BYXEgj/?spm_id_from=333.337.search-card.all.click&vd_source=7d778d43a7af5bca30ce6c124550edfd
【提示】本方法需要魔法(梯子)
一、 下载及部署
项目官方地址:
https://github.com/IDEA-Research/GroundingDINO
环境配置参考:
https://github.com/Dongdong-d/GroundingDino-Finetuning
官网下载好项目后,参考环境配置项目的readme 完成下方操作:
1.下载权重:
两个权重可以选一个下载
Swin-T:https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
Swin-B:https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha2/groundingdino_swinb_cogcoor.pth
2.环境配置
- 安装依赖
pip install -r requirements.txt
#注意 numpy<1.24 , yapf==0.40.1 (numpy 版本需低于 1.24)
- 安装groundingdino
cd GroundingDino-Finetuning/
pip install -e .
- 安装MultiScaleDeformableAttention注意力
cd groundingdino/models/GroundingDINO/ops
python setup.py install
- 测试MultiScaleDeformableAttention安装是否成功
python test.py
出现以下内容即为成功:
True check_forward_equal_with_pytorch_double: max_abs_err 8.67e-19 max_rel_err 2.35e-16
True check_forward_equal_with_pytorch_float: max_abs_err 4.66e-10 max_rel_err 1.13e-07
True check_gradient_numerical(D=30)
True check_gradient_numerical(D=32)
True check_gradient_numerical(D=64)
True check_gradient_numerical(D=71)
二、提示词标注脚本
【重要!!!】下面脚本执行方法:不能直接 run,需要先开启梯子(代理),并把代理端口配置复制到 PyCharm 的终端(Terminal)中,然后使用 python demo.py 命令来执行(具体操作请看开头的 bilibili 视频)
1.处理单张图像(这是视频教程中的代码)
在官方项目中新建一个脚本,将下面代码复制进去,注意修改路径(防止出错最好使用绝对路径)!
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
import cv2

# --- configuration (prefer absolute paths to avoid path errors) ---
CONFIG_PATH = "groundingdino/config/GroundingDINO_SwinT_OGC.py"  # config file shipped with the repo
CHECKPOINT_PATH = "./groundingdino_swint_ogc.pth"                # checkpoint downloaded above
DEVICE = "cuda"                                                  # "cpu" or "cuda"
IMAGE_PATH = "./testIMG.png"                                     # image to run detection on
TEXT_PROMPT = "There are many cars, people, and motorcycles on the street"  # edit to describe your scene
BOX_TRESHOLD = 0.35   # box confidence threshold (repo default)
TEXT_TRESHOLD = 0.25  # text/phrase matching threshold (repo default)

# Load the model, then run one prediction on the single input image.
model = load_model(CONFIG_PATH, CHECKPOINT_PATH)
image_source, image = load_image(IMAGE_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD,
    device=DEVICE,
)

# Draw the detected boxes and phrase labels, then save the visualization.
annotated = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated)
2. 批量标注多张图像, 并且保存LabelImg格式的TXT标注文件,方便做后续处理
import os
import cv2
import re
from groundingdino.util.inference import load_model, load_image, predict, annotate
# ====== User configuration ======
# 1. Model configuration (change these to your own paths)
CONFIG_PATH = "/GroundingDINO-main/groundingdino/config/GroundingDINO_SwinT_OGC.py"  # config shipped with the repo
CHECKPOINT_PATH = "/GroundingDINO-main/groundingdino_swint_ogc.pth"  # downloaded Swin-T checkpoint
DEVICE = "cpu"  # "cuda" if a GPU is available (author used CPU because of an unresolved C++ build issue)
# 2. Data paths (change these to your own directories)
IMAGE_DIR = "/GroundingDINO-main/myDataset/images"  # input image directory
OUTPUT_DIR = "/GroundingDINO-main/myDataset/output"  # output dir: annotated images + LabelImg-readable txt files
# 3. Detection parameters (tune over several runs; the prompt must describe the objects you want detected!)
TEXT_PROMPT = "white round disc.There are many cars, people,and motorcycles on the street"
BOX_TRESHOLD = 0.22  # box confidence threshold (typical range 0.3-0.5)
TEXT_TRESHOLD = 0.25  # text threshold (typical range 0.25-0.35)
# ====== 主程序开始 ======
def main():
# 确保输出目录存在
os.makedirs(OUTPUT_DIR, exist_ok=True)
# 加载模型 (只需加载一次)
print("加载模型中...")
model = load_model(CONFIG_PATH, CHECKPOINT_PATH)
print(f"模型加载完成,使用设备: {DEVICE}")
# 获取所有支持的图像文件
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff')
image_files = [f for f in os.listdir(IMAGE_DIR) if f.lower().endswith(image_extensions)]
total_images = len(image_files)
print(f"在目录 {IMAGE_DIR} 中找到 {total_images} 张图像")
print(f"使用提示词: {TEXT_PROMPT}")
print(f"参数设置: box_threshold={BOX_TRESHOLD}, text_threshold={TEXT_TRESHOLD}")
# 处理每张图像
for i, image_file in enumerate(image_files):
image_path = os.path.join(IMAGE_DIR, image_file)
print(f"\n处理图像 {i + 1}/{total_images}: {image_file}")
try:
# 加载图像
image_source, image = load_image(image_path)
# 进行预测
boxes, logits, phrases = predict(
model=model,
image=image,
caption=TEXT_PROMPT,
box_threshold=BOX_TRESHOLD,
text_threshold=TEXT_TRESHOLD,
device=DEVICE,
)
# 显示检测结果
print(f"检测到 {len(boxes)} 个目标: {phrases}")
# ====== 新增:创建映射后的标签列表用于图像标注 ======
mapped_phrases = []
for phrase in phrases:
# 标准化短语:小写+去除特殊字符
clean_phrase = phrase.strip().lower()
clean_phrase = re.sub(r'[^a-z0-9\s]', '', clean_phrase) # 移除非字母数字字符
# 基于关键词的标签映射(这里是我自己的逻辑处理,可以将关键词转换成规范的label,
#不然标注出来的图像标签会不统一)
if "grass" in clean_phrase:
class_name = "grass"
else:
class_name = "circle" # 无法识别的标签设为circle
mapped_phrases.append(class_name)
# 生成并保存标注图像(使用映射后的标签)
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=mapped_phrases)
image_output_path = os.path.join(OUTPUT_DIR, f"annotated_{image_file}")
cv2.imwrite(image_output_path, annotated_frame)
print(f"标注图像已保存至: {image_output_path}")
# ====== 保存LabelImg格式的TXT标注文件 ======
# 获取图像尺寸 (用于坐标转换)
height, width, _ = image_source.shape
# 创建TXT文件路径 (与图像同名,扩展名改为.txt)
txt_filename = os.path.splitext(image_file)[0] + ".txt"
txt_output_path = os.path.join(OUTPUT_DIR, txt_filename)
with open(txt_output_path, "w") as f:
# 写入每个检测目标的信息
for box, phrase in zip(boxes, phrases):
# 标准化短语:小写+去除特殊字符
clean_phrase = phrase.strip().lower()
clean_phrase = re.sub(r'[^a-z0-9\s]', '', clean_phrase) # 移除非字母数字字符
# 基于关键词的标签映射(同上,这里处理labelimg中的标签,自行修改)
if "grass" in clean_phrase:
class_name = "grass"
else:
class_name = "circle" # 无法识别的标签设为circle
# 解析边界框坐标 (中心点坐标和宽高)
center_x, center_y, w, h = box
# 转换为YOLO格式的归一化坐标
center_x_norm = center_x
center_y_norm = center_y
w_norm = w
h_norm = h
# 写入YOLO格式的行: class_id center_x center_y width height
f.write(f"{class_id} {center_x_norm:.6f} {center_y_norm:.6f} {w_norm:.6f} {h_norm:.6f}\n")
print(f"YOLO格式标注已保存至: {txt_output_path}")
except Exception as e:
print(f"处理图像 {image_file} 时出错: {str(e)}")
print("\n===== 批量标注完成 =====")
print(f"所有标注结果保存在: {OUTPUT_DIR}")
if __name__ == "__main__":
main()
如果有问题也可以参考这篇https://blog.youkuaiyun.com/qq_44442727/article/details/137677031
4032

被折叠的 条评论
为什么被折叠?



