MiniCPM-V-2_6: From Input to Output, a Code Walkthrough (Part 1)

Motivation

Earlier posts such as MINICPM-V2_6图像预处理流程-代码解读 and ChatGLM2-6B如何从输入到输出-代码解析(一) have drawn a fair number of readers. For plain NLP models that is essentially the whole processing pipeline (leaving aside architectures I have not covered yet, such as RWKV). This time I start from MiniCPM-V and rewrite its code into the familiar shape used in those posts.

Model structure

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from transformers import AutoProcessor, Qwen2PreTrainedModel, Qwen2ForCausalLM, TextIteratorStreamer

model_path = '/usr/downloads/MiniCPM-V-2_6'
model = AutoModel.from_pretrained(model_path, trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager

model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# single image
image = Image.open(model_path + '/assets/radar_final.png').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image, question]}]
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False)  # sampling=False makes the output deterministic
print(res)
# "The image is a radar chart comparing the performance of different models on various benchmarks. The benchmarks include Hallusion Bench, MathVista, AI2D, OCRBench, DocVQA, OpenCompass, MMB-1.1, MME, BLINK, Video-MME, Mantis, Object HalBench, ChartQA, and MMVet. The models compared are GPT-4V-20240409, Gemini 1.5 Pro, Cambrian-34B, InternVL2-8B, and MiniCPM-V 2.6 8B. Each model's performance is represented by a colored line, and the numerical values at the end of each line indicate the score or performance metric for that model on the respective benchmark."

# model
"""
MiniCPMV(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151666, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2Attention(
            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
        )
      )
      (norm): Qwen2RMSNorm((3584,), eps=1e-06)
      (rotary_emb): Qwen2RotaryEmbedding()
    )
    (lm_head): Linear(in_features=3584, out_features=151666, bias=False)
  )
  (vpm): SiglipVisionTransformer(
    (embeddings): SiglipVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
      (position_embedding): Embedding(4900, 1152)
    )
    (encoder): SiglipEncoder(
      (layers): ModuleList(
        (0-26): 27 x SiglipEncoderLayer(
          (self_attn): SiglipAttention(
            (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
            (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
            (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
            (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
          )
          (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): PytorchGELUTanh()
            (fc1): Linear(in_features=1152, out_features=4304, bias=True)
            (fc2): Linear(in_features=4304, out_features=1152, bias=True)
          )
          (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
        )
      )
    )
    (post_layernorm): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
  )
  (resampler): Resampler(
    (kv_proj): Linear(in_features=1152, out_features=3584, bias=False)
    (attn): MultiheadAttention(
      (out_proj): Linear(in_features=3584, out_features=3584, bias=True)
    )
    (ln_q): LayerNorm((3584,), eps=1e-06, elementwise_affine=True)
    (ln_kv): LayerNorm((3584,), eps=1e-06, elementwise_affine=True)
    (ln_post): LayerNorm((3584,), eps=1e-06, elementwise_affine=True)
  )
)
"""
  • Three parts: the language model is Qwen, the image encoder is SiglipVision, and a resampler compresses each image (and each slice) into a fixed number of tokens
  • The LLM is Qwen/Qwen2-7B-Instruct; only the vocabulary size changed, from 152064 to 151666
  • For the parameter breakdown see MiNicpm-o2.6和MiNicpm-V2.6模型架构对比; a quick way to count the parameters yourself is sketched after the snippets below
  • For the resampler see MINICPM-V2_6之图像embedding的resampler-代码解读
  • Asking the model who it is reveals an amusing quirk: I do not understand why a Qwen-based model replies that it was trained by OpenAI. I will not speculate; I can only say the world really is one big slapdash operation (noticed on 20230324 that MiniCPM-O-2_6 answers the same way)
question = 'Who are you?'
msgs = [{'role': 'user', 'content': [question]}]
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False)  # sampling=False makes the output deterministic
print(res)
# 'I am ChatGPT, a large language model trained by OpenAI. I am designed to assist with answering questions and providing information on a wide range of topics. Is there something specific you would like to know?'

question = '你是谁?'
msgs = [{'role': 'user', 'content': [question]}]
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False)  # sampling=False makes the output deterministic
print(res)
# '我是ChatGPT,一个由OpenAI训练的大型语言模型。我被设计成能够理解和生成类似人类的文本。今天我能为 您提供什么帮助?'
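
As mentioned in the list above, here is a minimal sketch (my own addition, not from the original repo) for checking how the parameters split across the three sub-modules; the attribute names llm, vpm and resampler come from the printed model structure, and the exact numbers depend on the loaded checkpoint.

def count_params(module):
    # total number of parameters in a sub-module
    return sum(p.numel() for p in module.parameters())

for name in ['llm', 'vpm', 'resampler']:
    sub = getattr(model, name)
    print(name, f'{count_params(sub) / 1e6:.1f}M parameters')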

Image preprocessing

For a detailed walkthrough see MINICPM-V2_6图像预处理流程-代码解读.
Save the code below as image_processing.py; it is needed later.
The original code for this part lives in image_processing_minicpmv.py in openbmb/MiniCPM-V-2_6.

# some basic constants
import math
import torch
from PIL import Image
from torchvision import transforms
image_feature_size = 64  # number of placeholder tokens per image
max_slice_nums = 9       # maximum number of slices
scale_resolution = 448   # maximum width or height per slice; 448/14 = 32 patches per side
patch_size = 14          # patch size

IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)  # timm.data.IMAGENET_INCEPTION_MEAN
IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)   # timm.data.IMAGENET_INCEPTION_STD
transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize(
                    mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD
                ),
            ]
        )  # standard image preprocessing

def get_slice_image_placeholder(image, tokenizer, image_num=0):
    """
    输入:image 单张图片
         tokenizer 分词器
    输出:原始图片(宽高是14的倍数)和分块后的图片,图片的占位符
    demo:
    image = Image.open('bicycle.png').convert('RGB')
    tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)
    slice_images, final_placeholder = get_slice_image_placeholder(image, tokenizer)
    # [<PIL.Image.Image image mode=RGB size=518x392 at 0x7F024D82FB80>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D305A80>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D82F310>]
    # <image><unk>*64</image><slice><unk>*64</slice><slice><unk>*64</slice>
    """
    # 这个函数里面得到的image_placeholder没有考虑<image_id>0</image_id> (即图片编号),其他的和autoprofessor都是一样的
    image_placeholder = (
        tokenizer.im_start
        + tokenizer.unk_token * image_feature_size
        + tokenizer.im_end
    )# <image>+<unk>*64+</image>
    slice_images = []
    source_image, patches, best_grid = slice_image(
        image,
        max_slice_nums,# 9
        scale_resolution,# 448
        patch_size,# 14
    )  # source image (width/height multiples of 14), patches (list of slices), best_grid (the chosen split)
    # <PIL.Image.Image image mode=RGB size=728x546 at 0x7F024D3B75B0> 
    # [[<PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7880>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7520>]]
    # [2,1]
    slice_images.append(source_image)  # put the source image (width/height multiples of 14) into slice_images
    final_placeholder = '<image_id>{}</image_id>'.format(image_num) + image_placeholder  # placeholder for the source image
    if len(patches) > 0:
        for i in range(len(patches)):
            for j in range(len(patches[i])):
                slice_images.append(patches[i][j])  # put the slices into slice_images
        final_placeholder += get_grid_placeholder(
            tokenizer, best_grid, image_feature_size
        )  # placeholders for the slices, wrapped in slice markers
        # '<slice><unk>*64</slice><slice><unk>*64</slice>'
        # note: rows of slices are separated by \n
    return slice_images, final_placeholder  # slice_images holds the source image (width/height multiples of 14) plus its slices; final_placeholder holds the placeholder string

def slice_image(image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
    """
    Input:  image             a single image
            max_slice_nums    maximum number of slices
            scale_resolution  target resolution per slice
            patch_size        patch size
    Output: source image (width/height multiples of 14), patches (list of slices), best_grid (the chosen split)
    demo:
    image = Image.open('bicycle.png').convert('RGB')
    max_slice_nums=9
    scale_resolution = 448
    patch_size = 14
    source_image, patches, best_grid = slice_image(image, max_slice_nums, scale_resolution, patch_size)
    # <PIL.Image.Image image mode=RGB size=728x546 at 0x7F024D3B75B0> 
    # [[<PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7880>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7520>]]
    # [2,1]
    """
    original_size = image.size  # image size, e.g. 667,500
    original_width, original_height = original_size
    log_ratio = math.log(original_width / original_height)  # 0.288181947493432
    ratio = original_width * original_height / (scale_resolution * scale_resolution)  # 1.6616509885204083
    multiple = min(math.ceil(ratio), max_slice_nums)  # 2, the ideal number of slices
    source_image = None
    best_grid = None
    patches = []
    if multiple <= 1 or never_split:  # no slicing needed; just resize (possibly upsample)
        # dont need to slice, upsample
        best_size = find_best_resize(
            original_size, scale_resolution, patch_size, allow_upscale=True
        )  # width and height rounded to multiples of patch_size
        source_image = image.resize(best_size, Image.Resampling.BICUBIC)  # resize
    else:
        candidate_split_grids_nums = []  # e.g. [2, 3]: drop the no-split option and anything above max_slice_nums
        for i in [multiple - 1, multiple, multiple + 1]:# 1 2 3
            if i == 1 or i > max_slice_nums:
                continue
            candidate_split_grids_nums.append(i)
        # source image, down-sampling and ensure divided by patch_size
        best_resize = find_best_resize(original_size, scale_resolution, patch_size)  # width and height as multiples of patch_size
        source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)  # 518,392
        candidate_grids = []
        # find best grid
        for split_grids_nums in candidate_split_grids_nums:  # 2 3
            # enumerate every possible grid for this slice count,
            # e.g. 6 slices can be split as 1x6, 2x3, 3x2 or 6x1
            m = 1
            while m <= split_grids_nums:
                if split_grids_nums % m == 0:
                    candidate_grids.append([m, split_grids_nums // m])
                m += 1
        # here that yields four candidates: 1x2, 2x1, 1x3, 3x1; each is scored to pick the best split
        best_grid = [1, 1]
        min_error = float("inf")
        for grid in candidate_grids:
            error = abs(log_ratio - math.log(grid[0] / grid[1]))# math.log(original_width / original_height)-math.log(m / n)
            if error < min_error:
                best_grid = grid
                min_error = error
        refine_size = get_refine_size(
            original_size, best_grid, scale_resolution, patch_size, allow_upscale=True
        )# 728,546
        refine_image = image.resize(refine_size, Image.Resampling.BICUBIC)# 728,546
        patches = split_to_patches(refine_image, best_grid)# [[<PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7880>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7520>]]
    return source_image, patches, best_grid  # source image (also multiples of 14), patches (the 2x1 slices), best_grid (the chosen split)
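
# A quick worked example of the grid selection above for the 667x500 demo image
# (my own arithmetic, just restating what the code computes):
#   ratio = 667*500 / 448**2 ≈ 1.66, so multiple = ceil(1.66) = 2
#   candidate grids: [1,2], [2,1], [1,3], [3,1]
#   log_ratio = log(667/500) ≈ 0.288
#   errors: |0.288 - log(1/2)| ≈ 0.98, |0.288 - log(2)| ≈ 0.41,
#           |0.288 - log(1/3)| ≈ 1.39, |0.288 - log(3)| ≈ 0.81
#   the smallest error wins, so best_grid = [2, 1], matching the demo in the docstring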

def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False):
    """
    Input:  original_size     the original image size
            grid              the grid split, a list
            scale_resolution  target resolution per slice
            patch_size        patch size
    Output: the size the original image should be resized to so that it splits evenly into the grid
    demo:
    original_size = 667,500
    grid = [2,1]
    scale_resolution = 448
    patch_size = 14
    best_length = get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=True)
    # 728,546
    """
    width, height = original_size
    grid_x, grid_y = grid
    refine_width = ensure_divide(width, grid_x)# 668
    refine_height = ensure_divide(height, grid_y)# 500
    grid_width = refine_width / grid_x# 334
    grid_height = refine_height / grid_y  # 500; the width and height of each grid cell
    best_grid_size = find_best_resize(
        (grid_width, grid_height),
        scale_resolution,
        patch_size,
        allow_upscale=allow_upscale,
    )  # 364,546; note allow_upscale=True here
    refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)# 728,546
    return refine_size

def ensure_divide(length, patch_size):
    """
    Input:  length      a length
            patch_size  patch size
    Output: the multiple of patch_size closest to length (at least patch_size)
    demo:
    length = 516
    patch_size = 14
    best_length = ensure_divide(length, patch_size)
    # 518
    """
    return max(round(length / patch_size) * patch_size, patch_size)


def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False):
    """
    Input:  original_size     the original image size
            scale_resolution  target resolution per slice
            patch_size        patch size
    Output: width and height rounded to multiples of patch_size
    demo:
    original_size = 667,500
    (best_width, best_height) = find_best_resize(original_size, scale_resolution=448, patch_size=14, allow_upscale=False)
    # 518,392 - case 1
    original_size = 334,500
    (best_width, best_height) = find_best_resize(original_size, scale_resolution=448, patch_size=14, allow_upscale=True)
    # 364,546 - case 2
    """
    width, height = original_size
    if (width * height > scale_resolution * scale_resolution) or allow_upscale:
        # Case 1: the image is larger than the target area:
        #   the rescale keeps the original aspect ratio while bringing the area
        #   as close as possible to scale_resolution * scale_resolution
        # Case 2: when allow_upscale is True the same step runs,
        #   except the image is enlarged instead of shrunk
        r = width / height# 1.334
        height = int(scale_resolution / math.sqrt(r))# 387
        width = int(height * r)# 516
        """
        original image:  width/height = 1.334
        resized image:   width/height = 1.333 and width*height = 199692
        scale_resolution*scale_resolution = 200704
        width / height = scale_resolution / math.sqrt(r) * r / (scale_resolution / math.sqrt(r)) = r
        height * width = scale_resolution / math.sqrt(r) * (scale_resolution / math.sqrt(r) * r) = scale_resolution*scale_resolution
        """
    best_width = ensure_divide(width, patch_size)   # 518, now a multiple of patch_size
    best_height = ensure_divide(height, patch_size)  # 392, now a multiple of patch_size
    return (best_width, best_height)


def split_to_patches(image, grid):
    """
    Input:  image  the image, already resized to the size implied by the grid
            grid   the grid split, a list
    Output: a nested list of cropped slices
    demo:
    image = Image.open('bicycle.png').convert('RGB')
    refine_image = image.resize((728,546), Image.Resampling.BICUBIC)# 728,546
    grid = [2,1]
    patches = split_to_patches(refine_image, grid)
    # [[<PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7880>, <PIL.Image.Image image mode=RGB size=364x546 at 0x7F024D3B7520>]]
    """
    patches = []
    width, height = image.size
    grid_x = int(width / grid[0])# 728/2=364
    grid_y = int(height / grid[1])# 546/1=546
    for i in range(0, height, grid_y):
        images = []
        for j in range(0, width, grid_x):
            box = (j, i, j + grid_x, i + grid_y)
            patch = image.crop(box)  # crop according to box
            images.append(patch)
        patches.append(images)
    return patches  # the cropped slices

def get_grid_placeholder(tokenizer, grid, query_num):  # returns the placeholders for the grid of slices; rows are joined with \n
    """
    Input:  tokenizer  the tokenizer
            grid       the grid split, a list
            query_num  number of placeholder tokens per slice
    Output: the placeholder string for the slices, wrapped in slice markers
    demo:
    tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)
    grid = [2,2]
    query_num = 64
    slice_placeholder = get_grid_placeholder(tokenizer, grid, query_num)
    # '<slice><unk>*64</slice><slice><unk>*64</slice>\n<slice><unk>*64</slice><slice><unk>*64</slice>'
    """
    image_placeholder = (
        tokenizer.slice_start + tokenizer.unk_token * query_num + tokenizer.slice_end
    )# '<slice><unk>*64</slice>'
    cols = grid[0]# 2
    rows = grid[1]# 1
    slices = []
    for i in range(rows):
        lines = []
        for j in range(cols):
            lines.append(image_placeholder)
        slices.append("".join(lines))
    # ['<slice><unk>*64</slice><slice><unk>*64</slice>']
    slice_placeholder = "\n".join(slices)  # note: a "\n" is inserted between rows
    # '<slice><unk>*64</slice><slice><unk>*64</slice>\n<slice><unk>*64</slice><slice><unk>*64</slice>'
    return slice_placeholder

def reshape_by_patch(image_tensor, patch_size):
    """
    :param image_tensor: shape [3, H, W]
    :param patch_size:
    :return: [3, patch_size, HW/patch_size]
    demo:
    image = Image.open('bicycle.png').convert('RGB')
    patch_size = 14
    tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)
    slice_images, _ = get_slice_image_placeholder(image, tokenizer)
    slice_image = transform(slice_images[0])# [3, 392, 518]
    reshape_image = reshape_by_patch(slice_image, patch_size)# [3, 14, 14504]
    """
    patches = torch.nn.functional.unfold(
        image_tensor,
        (patch_size, patch_size),
        stride=(patch_size, patch_size)
    )  # unfold image_tensor into (patch_size, patch_size) blocks: [3*14*14, H*W/(14*14)]
    patches = patches.reshape(image_tensor.size(0), patch_size, patch_size, -1)# 3,14,14,H*W/(14*14)
    patches = patches.permute(0, 1, 3, 2).reshape(image_tensor.size(0), patch_size, -1)# 3,14,H*W/14
    return patches# [3, patch_size, HW/patch_size]

from typing import List, Optional
def convert_to_tensors(tokenizer, input_ids, max_inp_length: Optional[int] = None):
    """
    Input:  tokenizer       the tokenizer
            input_ids       the input ids
            max_inp_length  maximum sequence length
    Output: input_ids as a tensor, plus image_bound (the start and end position of each image)
    demo:
    tokenizer = AutoTokenizer.from_pretrained('OpenBMB/MiniCPM-V-2_6', trust_remote_code=True)
    max_inp_length = 250
    input_ids = [151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,151645,    198, 151644,    872,    198, 151658,     15, 151659, 151646,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 151647, 151656, 151646, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244, 128244,128244, 128244, 128244, 128244, 128244, 151647]
    model_input = convert_to_tensors(tokenizer, input_ids, max_inp_length)
    # 'input_ids','image_bound'
    """
    if max_inp_length is not None:
        input_ids = input_ids[:max_inp_length]
    input_ids = torch.tensor(input_ids, dtype=torch.int32)# [232]
    start_cond = (input_ids == tokenizer.im_start_id) | (input_ids == tokenizer.slice_start_id)
    end_cond = (input_ids == tokenizer.im_end_id) | (input_ids == tokenizer.slice_end_id)
    image_start_tokens = torch.where(start_cond)[0]  # positions where an image starts, tensor([ 17,  84, 150])
    # skip im_start
    image_start_tokens += 1  # shift by one, tensor([ 18,  85, 151])
    image_end_tokens = torch.where(end_cond)[0]  # positions where an image ends, tensor([ 82, 149, 215])
    valid_image_nums = max(len(image_start_tokens), len(image_end_tokens))  # number of valid images, 3
    image_bound = torch.hstack(
        [
            image_start_tokens[:valid_image_nums].unsqueeze(-1),
            image_end_tokens[:valid_image_nums].unsqueeze(-1),
        ]
    )  # n*2, the start and end positions of the n images
    model_input = {}
    model_input["input_ids"] = input_ids.unsqueeze(0)# 1*句子长度
    model_input["image_bound"] = image_bound
    return model_input# 返回了input_ids和image_bound(图片的开始位置和结束位置)


#max_inp_length = 250
#model_inputs = convert_to_tensors(tokenizer, input_ids, max_inp_length) # 'input_ids','image_bound'
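
Before moving on, a small sanity check I added (not from the original repo): reshape_by_patch is a lossless rearrangement, so folding the patches back with torch.nn.functional.fold reproduces the input exactly. The helper unpatch below is my own name, not part of MiniCPM-V; the demo lines are left commented out in the same style as above so the module stays import-safe.

def unpatch(patches, patch_size, H, W):
    """Inverse of reshape_by_patch: [3, patch_size, H*W/patch_size] -> [3, H, W]."""
    C = patches.size(0)
    patches = patches.reshape(C, patch_size, -1, patch_size)  # 3, 14, L, 14
    patches = patches.permute(0, 1, 3, 2).reshape(C * patch_size * patch_size, -1)  # 3*14*14, L
    return torch.nn.functional.fold(
        patches.unsqueeze(0),
        output_size=(H, W),
        kernel_size=(patch_size, patch_size),
        stride=(patch_size, patch_size),
    ).squeeze(0)

#x = torch.randn(3, 392, 518)
#y = reshape_by_patch(x, 14)                      # [3, 14, 14504]
#print(torch.equal(unpatch(y, 14, 392, 518), x))  # True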

Main code

At the moment this code is copied from modeling_minicpmv.py; only the chat function was modified, and the modified parts mostly come from processing_minicpmv.py in openbmb/MiniCPM-V-2_6.
Save the code below as minicpm_v.py and you are ready to use it.


import math
from typing import List, Optional
import json
import torch
import torchvision

from threading import Thread
from copy import deepcopy
from PIL import Image
from transformers import AutoProcessor, Qwen2PreTrainedModel, Qwen2ForCausalLM, TextIteratorStreamer

from configuration_minicpm import MiniCPMVConfig
from modeling_navit_siglip import SiglipVisionTransformer
from resampler import Resampler
from image_processing import *


class MiniCPMVPreTrainedModel(Qwen2PreTrainedModel):
    config_class = MiniCPMVConfig


class MiniCPMV(MiniCPMVPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.llm = Qwen2ForCausalLM(config)
        self.vpm = self.init_vision_module()
        self.vision_dim = self.vpm.embed_dim
        self.embed_dim = self.llm.config.hidden_size
        self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
        self.processor = None

        self.terminators = ['<|im_end|>', '<|endoftext|>']

    def init_vision_module(self):
        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
        if self.config._attn_implementation == 'flash_attention_2':
            self.config.vision_config._attn_implementation = 'flash_attention_2'
        else:
            # sdpa is not supported here
            self.config.vision_config._attn_implementation = 'eager'
        model = SiglipVisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]

        setattr(model, 'embed_dim', model.embeddings.embed_dim)
        setattr(model, 'patch_size', model.embeddings.patch_size)

        return model

    def init_resampler(self, embed_dim, vision_dim):
        return Resampler(
            num_queries=self.config.query_num,
            embed_dim=embed_dim,
            num_heads=embed_dim // 128,
            kv_dim=vision_dim,
            adaptive=True
        )

    def get_input_embeddings(self):
        return self.llm.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.llm.embed_tokens = value

    def get_output_embeddings(self):
        return self.llm.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.llm.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.llm = decoder

    def get_decoder(self):
        return self.llm

    def get_vllm_embedding(self, data):
        if 'vision_hidden_states' not in data:
            dtype = self.llm.model.embed_tokens.weight.dtype
            device = self.llm.model.embed_tokens.weight.device
            tgt_sizes = data['tgt_sizes']
            pixel_values_list = data['pixel_values']
            vision_hidden_states = []
            all_pixel_values = []
            img_cnt = []
            for pixel_values in pixel_values_list:
                img_cnt.append(len(pixel_values))
                all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values])

            # exist image
            if all_pixel_values:
                tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
                tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)

                max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])

                all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True,
                                                                   padding_value=0.0)
                B, L, _ = all_pixel_values.shape
                all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)

                patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
                for i in range(B):
                    patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True

                vision_batch_size = self.config.vision_batch_size
                all_pixel_values = all_pixel_values.type(dtype)
                if B > vision_batch_size:
                    hs = []
                    for i in range(0, B, vision_batch_size):
                        start_idx = i
                        end_idx = i + vision_batch_size
                        tmp_hs = self.vpm(all_pixel_values[start_idx:end_idx], patch_attention_mask=patch_attn_mask[start_idx:end_idx], tgt_sizes=tgt_sizes[start_idx:end_idx]).last_hidden_state
                        hs.append(tmp_hs)
                    vision_embedding = torch.cat(hs, dim=0)
                else:
                    vision_embedding = self.vpm(all_pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes).last_hidden_state
                vision_embedding = self.resampler(vision_embedding, tgt_sizes)

                start = 0
                for pixel_values in pixel_values_list:
                    img_cnt = len(pixel_values)
                    if img_cnt > 0:
                        vision_hidden_states.append(vision_embedding[start: start + img_cnt])
                        start += img_cnt
                    else:
                        vision_hidden_states.append([])
            else: # no image
                if self.training:
                    dummy_image = torch.zeros(
                        (1, 3, 224, 224),
                        device=device, dtype=dtype
                    )
                    tgt_sizes = torch.Tensor([[(224 // self.config.patch_size), math.ceil(224 / self.config.patch_size)]]).type(torch.int32)
                    dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
                else:
                    dummy_feature = []
                for _ in range(len(pixel_values_list)):
                    vision_hidden_states.append(dummy_feature)

        else:
            vision_hidden_states = data['vision_hidden_states']

        if hasattr(self.llm.config, 'scale_emb'):
            vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
        else:
            vllm_embedding = self.llm.model.embed_tokens(data['input_ids'])

        vision_hidden_states = [i.type(vllm_embedding.dtype) if isinstance(
            i, torch.Tensor) else i for i in vision_hidden_states]

        bs = len(data['input_ids'])
        for i in range(bs):
            cur_vs_hs = vision_hidden_states[i]
            if len(cur_vs_hs) > 0:
                cur_vllm_emb = vllm_embedding[i]
                cur_image_bound = data['image_bound'][i]
                if len(cur_image_bound) > 0:
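                    # image_bound holds [start, end) spans of the <unk> placeholder tokens;
                    # gather all those positions and scatter the resampled vision features
                    # over the corresponding rows of the text embedding, so the LLM reads
                    # image features exactly where the placeholders sat in the prompt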
                    image_indices = torch.stack(
                        [torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]
                    ).to(vllm_embedding.device)

                    cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
                                          cur_vs_hs.view(-1, cur_vs_hs.shape[-1]))
                elif self.training:
                    cur_vllm_emb += cur_vs_hs[0].mean() * 0

        return vllm_embedding, vision_hidden_states

    def forward(self, data, **kwargs):
        vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data)
        position_ids = data["position_ids"]
        if position_ids.dtype != torch.int64:
            position_ids = position_ids.long()

        return self.llm(
            input_ids=None,
            position_ids=position_ids,
            inputs_embeds=vllm_embedding,
            **kwargs
        )
    
    def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        output = self.llm.generate(
            inputs_embeds=inputs_embeds,
            pad_token_id=0,
            eos_token_id=terminators,
            attention_mask=attention_mask,
            **kwargs
        )
        if decode_text:
            return self._decode_text(output, tokenizer)
        return output

    def _decode_stream(self, inputs_embeds, tokenizer, **kwargs):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        streamer = TextIteratorStreamer(tokenizer=tokenizer)
        generation_kwargs = {
            'inputs_embeds': inputs_embeds,
            'pad_token_id': 0,
            'eos_token_id': terminators,
            'streamer': streamer
        }
        generation_kwargs.update(kwargs)

        thread = Thread(target=self.llm.generate, kwargs=generation_kwargs)
        thread.start()
    
        return streamer

    def _decode_text(self, result_ids, tokenizer):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        result_text = []
        for result in result_ids:
            result = result[result != 0]
            if result[0] == tokenizer.bos_id:
                result = result[1:]
            if result[-1] in terminators:
                result = result[:-1]
            result_text.append(tokenizer.decode(result).strip())
        return result_text

    def generate(
        self,
        input_ids=None,
        pixel_values=None,
        tgt_sizes=None,
        image_bound=None,
        attention_mask=None,
        tokenizer=None,
        vision_hidden_states=None,
        return_vision_hidden_states=False,
        stream=False,
        decode_text=False,
        **kwargs
    ):
        assert input_ids is not None
        assert len(input_ids) == len(pixel_values)

        model_inputs = {
            "input_ids": input_ids,
            "image_bound": image_bound,
        }

        if vision_hidden_states is None:
            model_inputs["pixel_values"] = pixel_values
            model_inputs['tgt_sizes'] = tgt_sizes
        else:
            model_inputs["vision_hidden_states"] = vision_hidden_states

        with torch.inference_mode():
            (
                model_inputs["inputs_embeds"],
                vision_hidden_states,
            ) = self.get_vllm_embedding(model_inputs)

            if stream:
                result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
            else:
                result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs)

        if return_vision_hidden_states:
            return result, vision_hidden_states
        
        return result

    def chat(
        self,
        msgs,
        tokenizer,
        max_new_tokens=2048,
        sampling=True,
        max_inp_length=8192,
        stream=False,
        use_image_id=None,
        **kwargs
    ):  # by default chat does not run in batch mode
        msgs_list = [msgs]
        device = self.llm.model.embed_tokens.weight.device
        
        prompts_lists = []
        input_images_lists = []
        for msgs in msgs_list:
            copy_msgs = deepcopy(msgs)

            for i, msg in enumerate(copy_msgs):
                role = msg["role"]
                content = msg["content"]
                if i == 0:  # the first message must come from the user
                    assert role == "user", "The role of first msg should be user"
                if isinstance(content, str):
                    content = [content]  # wrap a plain string in a list so the loop below handles both cases
                image_num = 0
                images = []
                cur_msgs = []
                tgt_sizes = []
                
                for c in content:
                    if isinstance(c, Image.Image):
                        #images.append(c)# in the original chat code the image slot becomes (<image>./</image>)
                        #cur_msgs.append("(<image>./</image>)")
                        slice_images, image_placeholder = get_slice_image_placeholder(c, tokenizer, image_num)
                        image_num += 1
                        cur_msgs.append(image_placeholder)
                        for slice_image in slice_images:  # 3 images in the demo: the source image plus two slices
                            slice_image = transform(slice_image)# [3, H, W]
                            H, W = slice_image.shape[1:]
                            images.append(reshape_by_patch(slice_image, patch_size).to(device))# [3, patch_size, HW/patch_size]
                            tgt_sizes.append(torch.Tensor([H // patch_size, W // patch_size]).type(torch.int32))# H/14,W/14
                    elif isinstance(c, str):
                        cur_msgs.append(c)
                msg["content"] = "\n".join(cur_msgs)
        input_ids = tokenizer.apply_chat_template(copy_msgs, tokenize=True, add_generation_prompt=True)  # covers both the image placeholders and the text


        if tgt_sizes:
            tgt_sizes = torch.vstack(tgt_sizes).to(device)# n*2

        model_inputs = convert_to_tensors(tokenizer, input_ids, max_inp_length)
        model_inputs['image_bound'] = [model_inputs['image_bound']]
        model_inputs['input_ids'] = model_inputs['input_ids'].to(device)
        model_inputs['tgt_sizes'] = [tgt_sizes]
        #model_inputs['image_sizes'] = torch.tensor(image.size)
        model_inputs['pixel_values'] = [images]
        model_inputs.pop('image_sizes', None)  # image_sizes is not set above, so pop defensively
        if sampling:
            generation_config = {
                "top_p": 0.8,
                "top_k": 100,
                "temperature": 0.7,
                "do_sample": True,
                "repetition_penalty": 1.05
            }
        else:
            generation_config = {
                "num_beams": 3,
                "repetition_penalty": 1.2,
            }

        generation_config.update(
            (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
        )
        res = self.generate(**model_inputs, 
                            tokenizer=tokenizer, 
                            max_new_tokens=max_new_tokens, 
                            stream=stream,
                            decode_text=True,
                            **generation_config)
        
        if stream:
            def stream_gen():
                for text in res:
                    for term in self.terminators:
                        text = text.replace(term, '')
                    yield text
            return stream_gen()

        else:
            answer = res[0]
            return answer

Call the model with the code below and you are ready to play.

import torch
from PIL import Image
from transformers import AutoTokenizer
from minicpm_v import *

model_path = '/usr/downloads/MiniCPM-V-2_6'

model = MiniCPMV.from_pretrained(model_path, trust_remote_code=True,
    attn_implementation='sdpa', torch_dtype=torch.bfloat16) 

model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# single image
image = Image.open(model_path + '/assets/radar_final.png').convert('RGB')
question = 'What is in the image?'
msgs = [{'role': 'user', 'content': [image,question]}]
res = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False)  # sampling=False makes the output deterministic
print(res)
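
The chat method above also has a streaming path (the stream_gen generator in minicpm_v.py), so a hedged variation of the same call looks like this; sampling is enabled here since the text streamer is not intended for beam search.

res = model.chat(msgs=msgs, tokenizer=tokenizer, sampling=True, stream=True)
for chunk in res:
    # chunks arrive as they are generated
    print(chunk, end='', flush=True)
print()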

Closing remarks

This post mainly ties together some earlier work: replacing the image preprocessing pipeline entirely with my own code still produces the correct results. I have not drawn the flow diagrams yet; I will do them together with the image preprocessing ones later. Looking forward to the next installment.
