# Florence-2-large-ft Deployment Guide: Cloud and Local Options
[Free download] Florence-2-large-ft project page: https://ai.gitcode.com/mirrors/Microsoft/Florence-2-large-ft
## Overview
Florence-2-large-ft is Microsoft's multimodal vision foundation model. At 0.77B parameters and fine-tuned on downstream tasks, it handles image captioning, object detection, OCR, and other vision tasks through a single prompt-driven interface. This guide walks through cloud and local deployment options to help developers get up and running quickly.
## Model Characteristics
| Feature | Spec | Notes |
|---|---|---|
| Parameters | 0.77B | Mid-sized; balances quality and efficiency |
| Precision | FP16 | Supports half-precision inference |
| Architecture | Encoder-decoder | Sequence-to-sequence generation |
| Input resolution | 768×768 | Standard input image size |
| Max sequence length | 1024 | Maximum text sequence length |
## Environment Requirements
### Hardware Requirements
The model card sets no formal hardware floor. As a rough guide, 0.77B parameters in FP16 occupy about 1.5 GB, so a GPU with 4 GB of VRAM or more runs the model comfortably (beam search and activations add overhead on top of the weights); CPU-only inference works in FP32 but is markedly slower.
### Software Dependencies
```bash
# Core dependencies (quote the specifiers so the shell doesn't treat >= as redirection)
pip install "torch>=1.13.0"
pip install "transformers>=4.41.0"
pip install "Pillow>=9.0.0"
pip install "requests>=2.28.0"

# Optional dependencies
pip install accelerate     # multi-device inference acceleration
pip install bitsandbytes   # quantization support
```
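A quick sanity check that the stack imports cleanly and sees the GPU, if one is present:

```bash
python -c "import torch, transformers; print(torch.__version__, transformers.__version__, 'cuda:', torch.cuda.is_available())"
```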
## Cloud Deployment
### Hugging Face Inference API
```python
import requests
import base64

# API endpoint configuration.
# Caveat: Florence-2 requires trust_remote_code, which the serverless
# Inference API may not support; the payload format below assumes a custom
# inference handler, so verify it against your actual endpoint before use.
API_URL = "https://api-inference.huggingface.co/models/microsoft/Florence-2-large-ft"
headers = {"Authorization": "Bearer YOUR_HF_TOKEN"}

def query_florence2(image_path, prompt="<CAPTION>"):
    # Read and base64-encode the image
    with open(image_path, "rb") as f:
        image_data = base64.b64encode(f.read()).decode("utf-8")
    # Build the request payload
    payload = {
        "inputs": {
            "image": image_data,
            "text": prompt
        }
    }
    # Send the request
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

# Usage example
result = query_florence2("car.jpg", "<OD>")
print(result)
```
### AWS SageMaker Deployment
```python
import sagemaker
from sagemaker.huggingface import HuggingFaceModel

# Configure the SageMaker session
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

# Create the model (model.tar.gz must contain the model files and,
# for trust_remote_code models, a custom inference script)
huggingface_model = HuggingFaceModel(
    model_data="s3://your-bucket/florence-2-large-ft/model.tar.gz",
    role=role,
    transformers_version="4.41",
    pytorch_version="2.0",
    py_version="py310",
)

# Deploy an endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge"
)

# Invoke the endpoint (image_base64 is a base64-encoded image string
# prepared by the caller)
result = predictor.predict({
    "image": image_base64,
    "prompt": "<CAPTION>"
})
```
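When the endpoint is no longer needed, tear it down to stop paying for the instance:

```python
# Delete the SageMaker endpoint to avoid idle-instance charges
predictor.delete_endpoint()
```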
## Local Deployment
### Basic Local Deployment
```python
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image

class Florence2Deployer:
    def __init__(self, device=None, torch_dtype=None):
        self.device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
        self.torch_dtype = torch_dtype or (torch.float16 if torch.cuda.is_available() else torch.float32)
        # Load the model and processor
        self.model = AutoModelForCausalLM.from_pretrained(
            "microsoft/Florence-2-large-ft",
            torch_dtype=self.torch_dtype,
            trust_remote_code=True
        ).to(self.device)
        self.processor = AutoProcessor.from_pretrained(
            "microsoft/Florence-2-large-ft",
            trust_remote_code=True
        )

    def inference(self, image, prompt, max_new_tokens=1024):
        """Run a single inference pass."""
        inputs = self.processor(
            text=prompt,
            images=image,
            return_tensors="pt"
        ).to(self.device, self.torch_dtype)
        generated_ids = self.model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=max_new_tokens,
            do_sample=False,
            num_beams=3
        )
        generated_text = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=False
        )[0]
        return self.processor.post_process_generation(
            generated_text,
            task=prompt,
            image_size=(image.width, image.height)
        )

# Usage example
deployer = Florence2Deployer()
image = Image.open("car.jpg")
result = deployer.inference(image, "<OD>")
print(result)
```
### Containerized Deployment with Docker
```dockerfile
# Dockerfile
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime

WORKDIR /app

# Install dependencies (python-multipart is required by FastAPI for file uploads)
RUN pip install transformers==4.41.0 Pillow==9.5.0 requests==2.28.2 \
    fastapi uvicorn python-multipart

# Copy application files
COPY . .

# Expose the service port
EXPOSE 8000

# Start the FastAPI service
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
```
```python
# app.py - FastAPI service
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import io

app = FastAPI(title="Florence-2-large-ft API")

# Global model instances
model = None
processor = None
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

@app.on_event("startup")
async def load_model():
    global model, processor
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Florence-2-large-ft",
        torch_dtype=torch_dtype,
        trust_remote_code=True
    ).to(device)
    processor = AutoProcessor.from_pretrained(
        "microsoft/Florence-2-large-ft",
        trust_remote_code=True
    )

@app.post("/inference")
async def inference_endpoint(
    image: UploadFile = File(...),
    prompt: str = "<CAPTION>"
):
    # Read the uploaded image (convert to RGB in case of RGBA/grayscale input)
    image_data = await image.read()
    img = Image.open(io.BytesIO(image_data)).convert("RGB")
    # Run inference; inputs must live on the same device/dtype as the model
    inputs = processor(text=prompt, images=img, return_tensors="pt").to(device, torch_dtype)
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        num_beams=3
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    result = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(img.width, img.height)
    )
    return JSONResponse(content=result)
```
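With the service up, the endpoint can be exercised from the shell; the prompt travels as a URL-encoded query parameter (`<OD>` becomes `%3COD%3E`) and the image as multipart form data:

```bash
curl -X POST "http://localhost:8000/inference?prompt=%3COD%3E" \
  -F "image=@car.jpg"
```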
## Performance Optimization
### Quantized Deployment
```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the quantized model; device_map lets accelerate place the quantized
# weights on the GPU (calling .to(device) on a bitsandbytes-quantized
# model is not allowed)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Florence-2-large-ft",
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
```
### Batching Optimization
```python
def batch_inference(images, prompts):
    """Batched inference: one generate call over several image/prompt pairs.

    Note: the simple torch.cat below assumes every prompt in the batch
    tokenizes to the same length (e.g., all pairs use the same task token);
    mixed-length prompts would need padding first. With unpadded,
    equal-length prompts no attention mask is required, matching the
    single-sample path above.
    """
    processed_inputs = []
    for image, prompt in zip(images, prompts):
        inputs = processor(text=prompt, images=image, return_tensors="pt")
        processed_inputs.append(inputs)

    # Merge the per-sample tensors into one batch
    batch = {
        "input_ids": torch.cat([x["input_ids"] for x in processed_inputs], dim=0),
        "pixel_values": torch.cat([x["pixel_values"] for x in processed_inputs], dim=0),
    }
    generated_ids = model.generate(**batch, max_new_tokens=1024, num_beams=3)

    # Decode and post-process each sample against its own prompt and image size
    results = []
    for i, gen_ids in enumerate(generated_ids):
        generated_text = processor.batch_decode([gen_ids], skip_special_tokens=False)[0]
        result = processor.post_process_generation(
            generated_text,
            task=prompts[i],
            image_size=(images[i].width, images[i].height)
        )
        results.append(result)
    return results
```
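A minimal usage sketch, assuming two local images captioned with the same task token so the batch shapes line up (street.jpg is a placeholder filename):

```python
imgs = [Image.open("car.jpg"), Image.open("street.jpg")]
captions = batch_inference(imgs, ["<CAPTION>", "<CAPTION>"])
```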
## Task Prompt Reference
Florence-2-large-ft supports a range of vision tasks, each triggered by its own prompt token:
| Task | Prompt | Output format |
|---|---|---|
| Image captioning | `<CAPTION>` | Text description |
| Detailed captioning | `<DETAILED_CAPTION>` | Longer text description |
| Object detection | `<OD>` | Bounding boxes and labels |
| OCR | `<OCR>` | Recognized text |
| Region OCR | `<OCR_WITH_REGION>` | Text with positions |
| Region proposal | `<REGION_PROPOSAL>` | Candidate regions |
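As a sketch of what the post-processed output looks like: for `<OD>`, `post_process_generation` returns a dict keyed by the task token, holding parallel `bboxes` and `labels` lists (structure as shown on the Florence-2 model card):

```python
result = deployer.inference(image, "<OD>")
od = result["<OD>"]  # {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['car', ...]}
for box, label in zip(od["bboxes"], od["labels"]):
    print(label, [round(v, 1) for v in box])
```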
## Monitoring and Logging
```python
import logging
import time
from prometheus_client import Counter, Histogram

# Monitoring metrics
REQUEST_COUNT = Counter('inference_requests_total', 'Total inference requests')
REQUEST_LATENCY = Histogram('inference_latency_seconds', 'Inference latency')

class MonitoredFlorence2(Florence2Deployer):
    # The Histogram decorator records latency for Prometheus; the manual
    # timer below duplicates that measurement for the application log
    @REQUEST_LATENCY.time()
    def inference(self, image, prompt, max_new_tokens=1024):
        REQUEST_COUNT.inc()
        start_time = time.time()
        result = super().inference(image, prompt, max_new_tokens)
        latency = time.time() - start_time
        logging.info(f"Inference completed in {latency:.2f}s")
        return result
```
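To let Prometheus scrape these metrics, start the client library's built-in HTTP server (the port here is an arbitrary choice):

```python
from prometheus_client import start_http_server

# Serve the /metrics endpoint on port 9000 for Prometheus to scrape
start_http_server(9000)
```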
## Troubleshooting
### Common Issues and Fixes
| Issue | Cause | Fix |
|---|---|---|
| CUDA out of memory | Insufficient VRAM | Use quantization or reduce the batch size |
| Slow inference | Hardware limits | Enable TensorRT optimization |
| Model fails to load | Network problems | Load from local model files |
| Malformed output | Wrong prompt | Check the prompt token format |
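For the network-related load failures, a minimal sketch of loading from a local copy of the weights (the path below is a placeholder for wherever the repository was cloned):

```python
from transformers import AutoModelForCausalLM, AutoProcessor

local_path = "/models/Florence-2-large-ft"  # placeholder: your local clone
model = AutoModelForCausalLM.from_pretrained(
    local_path, trust_remote_code=True, local_files_only=True
)
processor = AutoProcessor.from_pretrained(
    local_path, trust_remote_code=True, local_files_only=True
)
```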
### Health Check Endpoint
```python
import time  # required alongside the app.py imports above

@app.get("/health")
async def health_check():
    return {
        "status": "healthy",
        "model_loaded": model is not None,
        "device": str(model.device) if model else "none",
        "timestamp": time.time()
    }
```
## Deployment Architecture Comparison
| Dimension | Cloud deployment | Local deployment |
|---|---|---|
| Best for | Rapid prototyping, small-to-mid-scale apps | Strict data-security and latency requirements |
| Infrastructure | Managed (HF Inference API, SageMaker) | Self-hosted servers or Docker containers |
| Data locality | Data leaves the premises | Data stays on-premises |
## Summary
As a multimodal vision foundation model, Florence-2-large-ft offers flexible deployment paths. Cloud deployment suits rapid prototyping and small-to-mid-scale applications, while local deployment fits enterprise scenarios with strict data-security or latency requirements. With sensible performance optimization and monitoring in place, the model can serve reliably and efficiently in either environment.
When choosing a deployment approach, weigh business needs, resource constraints, and your team's capabilities together. Whichever path you take, Florence-2-large-ft provides a strong foundation for vision AI applications.
Disclosure: parts of this article were produced with AI assistance (AIGC) and are provided for reference only.



