MiniCPM-V-2_6 (4-bit 量化)使用

原创已于 2025-11-25 17:07:01 修改 · 628 阅读

12 ·

CC 4.0 BY-SA版权

文章标签：

#java #数据库 #前端

于 2025-11-25 17:06:02 首次发布

MiniCPM-o 2.6

MiniCPM-o 2.6 is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.6, and introduces new features for real-time speech conversation and multimodal live streaming. Notable features of MiniCPM-o 2.6 include:

环境名称	版本信息1
Ubuntu	Ubuntu 24.04.3 LTS
Cuda	13.0
Python	Python 3.10.19
NVIDIA Corporation	RTX 5060 Ti 16G
NVIDIA-SMI	580.105.08
模型	MiniCPM-V-2_6 4-bit 量化
引入库	import torch; from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

对比结果：与 qwen3-VL:8B 相比，MiniCPM-V-2_6（4-bit 量化）分析图片的速度快约 10 倍。

# 正在分析: images/imgs/haveProblem/snap1761201924.001.jpg

# 📁 已移动: snap1761201924.001.jpg -> have_problem

# ✅ 完成: snap1761201924.001.jpg | 耗时: 1.09s | 已完成: 2271/2496 | 结果: 占道经营...

# --------------------------------------------------------------------------------

# 正在分析: images/imgs/haveProblem/snap1761201925.001.jpg

# 📁 已移动: snap1761201925.001.jpg -> have_problem

# ✅ 完成: snap1761201925.001.jpg | 耗时: 1.2s | 已完成: 2272/2496 | 结果: 占道经营, 乱扔垃圾...

模型核心代码如下。已删除业务逻辑代码。

# Local path of the MiniCPM-V-2_6 checkpoint (ModelScope cache directory).
MODEL_ID = "/home/wr/.cache/modelscope/hub/models/OpenBMB/MiniCPM-V-2_6"

# Folder that holds the images to analyze.
IMAGE_FOLDER = "images/imgs"

# Accepted image file suffixes: each one is listed in both its lower-case
# and its upper-case spelling.
IMG_EXTENSIONS = {
    variant
    for suffix in (".jpg", ".jpeg", ".png", ".bmp", ".gif")
    for variant in (suffix, suffix.upper())
}

# === Load the model (4-bit quantization) ===
print("正在加载模型...")

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# computing in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# trust_remote_code=True is passed for both the tokenizer and the model, so
# custom code shipped with the checkpoint is allowed to run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModel.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    device_map="cuda",
    trust_remote_code=True,
)

# Inference only: switch the module to evaluation mode.
model.eval()

print("模型加载完成。")

# === 解析模型输出 ===

def parse_issue_response(raw_response: str) -> dict:
    """Turn the model's raw text reply into a structured result.

    The reply is searched for the span running from the first ``{`` to the
    last ``}`` (greedy match, newlines included). What happens next depends
    on the outcome:

    * span found and valid JSON: the decoded object is handed to
      ``correct_structured_response``;
    * no span found: the whole reply is used as the value of
      ``description``, ``address`` and ``location``;
    * anything raises (invalid JSON, non-string input, a failure inside
      ``correct_structured_response``): an empty dict is handed to
      ``correct_structured_response`` instead.

    Args:
        raw_response: Text returned by the model.

    Returns:
        Whatever ``correct_structured_response`` returns for the chosen
        payload (annotated as ``dict``).
    """
    try:
        match = re.search(r"\{.*\}", raw_response, re.DOTALL)
        if match is None:
            # No JSON-looking span at all: keep the raw text in every field.
            fallback = dict.fromkeys(
                ("description", "address", "location"), raw_response
            )
            return correct_structured_response(fallback)
        return correct_structured_response(json.loads(match.group()))
    except Exception:
        # Best-effort: never let a malformed reply abort the batch.
        return correct_structured_response({})

# === 分析单张图片 ===

# Analyze one image: open it as RGB, build the JSON-only prompt, send image
# and prompt to model.chat(), and time the call.
# NOTE(review): this excerpt is truncated — the `try:` below has no matching
# `except`, and no value is returned in the visible lines. The main loop reads
# result["error"], result["image_path"], result["response"] and
# result["analysis_time_seconds"], so the omitted tail presumably builds a
# dict with those keys — confirm against the full source.
def analyze_image(image_path: Path) -> dict:

try:

image = Image.open(image_path).convert("RGB")

# Prompt: the global VALID_ISSUE_TYPES and VALID_LOCATIONS lists are embedded automatically, so the prompt text never has to be synced by hand

issue_types_str = "、".join(VALID_ISSUE_TYPES)

# NOTE(review): for two or more entries this equals "、".join(VALID_LOCATIONS);
# for a single entry it yields a leading "、", and for an empty list the [-1]
# index raises IndexError. locations_str is also not referenced by the prompt
# visible below — presumably it was used in the removed JSON field list.
locations_str = "、".join(VALID_LOCATIONS[:-1]) + "、" + VALID_LOCATIONS[-1] # handle the punctuation before the last element

question = f"""必须输出JSON格式字符串，不得添加任何额外文字！

判断图中是否存在以下城市管理问题（仅可选）：[{issue_types_str}]

JSON字段要求：

{{

}}

若不存在问题，仅输出JSON：{{"has_issue": false, "description": "无问题"}}

"""

# Single-turn conversation: one user message carrying the prompt text.
msgs = [{"role": "user", "content": question}]

# Wall-clock timing around the model call only.
start_time = time.time()

raw_response = model.chat(

image=image,

msgs=msgs,

tokenizer=tokenizer

)

end_time = time.time()

# Called after every image, outside the timed span.
torch.cuda.empty_cache()

# Elapsed seconds, rounded to two decimals.
analysis_time = round(end_time - start_time, 2)

# === 主流程 ===

# Analyze every image in sorted order and log each result to the JSON file
# that matches its outcome (error / no problem / problem found).
for idx, img_path in enumerate(sorted(image_paths), 1):
    print(f"正在分析: {img_path}")
    result = analyze_image(img_path)
    target_name = img_path.name

    # Pick the destination folder and the log file for this outcome.
    if result["error"]:
        dest_dir, log_file = except_img_dir, "except.json"
    elif is_no_problem(result["response"]):
        dest_dir, log_file = no_problem_dir, "no_problem.json"
    else:
        dest_dir, log_file = have_problem_dir, "have_problem.json"

    # target_path is not used in the visible lines; presumably the file move
    # happens in code omitted from this excerpt.
    target_path = dest_dir / target_name

    # The logged record has the same three fields for every outcome.
    append_to_json(
        {
            field: result[field]
            for field in ("image_path", "response", "analysis_time_seconds")
        },
        log_file,
    )