Testing Qwen2-VL-2B-Instruct

1 Image Inference

# Import the required libraries
from PIL import Image
import requests
import torch
from torchvision import io  # PyTorch's computer-vision toolkit
from typing import Dict  # for type annotations
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor  # Hugging Face transformers, used to load and run the pretrained model

# # Load the model in half precision, automatically selecting the available device
# model = Qwen2VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
# )
# # Load the processor, which preprocesses the input data
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")

# Load the model and processor from a local download instead
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct",  # change to your local download path
    torch_dtype="auto",
    device_map="auto"
)
processor = AutoProcessor.from_pretrained("/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct")  # change to your local download path

# # Set the image URL
# url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
# image = Image.open(requests.get(url, stream=True).raw)

image = Image.open("/home/fyo/Pictures/earthquake.jpg")


# Build the conversation structure: a user turn containing an image and a text prompt
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "描述这张图."},
        ],
    }
]

# Apply the chat template with the processor to build the text prompt
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

# Preprocess the inputs: convert the text and image into a format the model accepts
inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")  # move the inputs to the GPU

# Generate the output with the model
output_ids = model.generate(**inputs, max_new_tokens=128)

# Keep only the newly generated tokens (strip the prompt portion)
generated_ids = [
    output_ids[len(input_ids) :]
    for input_ids, output_ids in zip(inputs.input_ids, output_ids)
]

# Decode the generated tokens into readable text
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

# Print the generated text
print(output_text)
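
If the source picture is very large, the number of visual tokens can be bounded by passing `min_pixels`/`max_pixels` when loading the processor, the same arguments used in section 3 below. A minimal sketch, assuming the same local model path (the exact limits here are only illustrative):

# Optional: bound the image resolution the model sees (illustrative limits)
processor = AutoProcessor.from_pretrained(
    "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct",  # change to your local download path
    min_pixels=256 * 28 * 28,
    max_pixels=1280 * 28 * 28,
)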

2 Video Inference

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

model_name = "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    # torch_dtype=torch.float16,
    # attn_implementation="flash_attention_2",  # use flash-attention2 if your GPU supports it (the free Colab T4 does not)
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name)

def chat_with_video(file_name, query, video_width, video_height, fps=1.0):
    # Build a single user turn containing the video and the text query
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": file_name,
                    "max_pixels": video_width * video_height,
                    "fps": fps,
                },
                {"type": "text", "text": query},
            ],
        }
    ]

    # Apply the chat template to get the text prompt
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Extract the image/video inputs referenced in the messages
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate, then keep only the newly generated tokens and decode them
    generated_ids = model.generate(**inputs, max_new_tokens=1500)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text

output_text = chat_with_video(
    "/home/fyo/VisionLLM/downloads/3.mp4",
    "What does this video show? Please describe all the details.",
    734, 576, fps=0.5,
)
print(output_text)
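
The video example relies on `process_vision_info` from `qwen_vl_utils`, which is not shipped with transformers. Assuming the helper is the `qwen-vl-utils` package released alongside Qwen2-VL, it can be installed with pip:

pip install qwen-vl-utils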

3 Screen Inference

from transformers import AutoModelForVision2Seq, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
import mss
import mss.tools
import time

# Load the Qwen2-VL model and processor
model_name = "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct"  # change to your local model path
# model = AutoModelForVision2Seq.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float16)

processor = AutoProcessor.from_pretrained(
    model_name,
    min_pixels=256 * 28 * 28,  # minimum number of pixels
    max_pixels=2000 * 28 * 28  # maximum number of pixels
)

# Move the model to the available device once and switch to evaluation mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Capture a 1920x1440 pixel region from the top-left corner of the screen
def capture_screen_region():
    with mss.mss() as sct:
        monitor = {
            "top": 0,
            "left": 0,
            "width": 1920,
            "height": 1440,
        }
        screenshot = sct.grab(monitor)
        img = Image.frombytes('RGB', screenshot.size, screenshot.rgb)
        return img

# Describe an image with the Qwen2-VL model
def describe_image(image):
    # Build the chat messages: one user turn containing the image and a text prompt
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image in Chinese."},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)

    # Preprocess the inputs and move them to the same device as the model
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to(device)

    # Generate the description
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    # Keep only the newly generated tokens, then decode them into text
    generated_ids = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, outputs)]
    description = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return description[0]

# Describe the captured screen region in a loop
def real_time_description(interval=1):
    while True:
        # Capture the screen region defined above
        img = capture_screen_region()

        # Describe it with the Qwen2-VL model
        description = describe_image(img)
        print(f"Description: {description}")

        # Wait for a while before the next capture
        time.sleep(interval)

# Start the real-time description loop (stop with Ctrl+C)
real_time_description()
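
Besides transformers and qwen_vl_utils, the screen example needs the `mss` screen-capture library used by `capture_screen_region`. A minimal install sketch:

pip install mss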
### About the Qwen2-VL-2B-Instruct model

Qwen2-VL-2B-Instruct is an edge-side multimodal large model designed for complex vision-and-language tasks. It performs strongly across multiple benchmarks and suits a wide range of application scenarios. The model is released under an open-source license, so developers are free to download and use it[^1].

#### Download

Developers who want to try the model can use the official download page at [Qwen2-VL-2B-Instruct](https://hf-mirror.com/Qwen/Qwen2-VL-2B-Instruct), without worrying about network restrictions.

### Usage notes

The basic steps for using the model are as follows:

#### Install the dependencies

First make sure the required Python packages are installed; pip handles this:

```bash
pip install transformers torch safetensors
```

#### Load the pretrained model

Load the locally stored pretrained weights into memory for later inference:

```python
from transformers import AutoModelForVision2Seq, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
model = AutoModelForVision2Seq.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
```

#### Inference example

A short snippet showing how to feed an image plus a prompt to the model and get a text description back:

```python
import requests
from PIL import Image
from io import BytesIO

url = "http://example.com/image.jpg"
response = requests.get(url)
image = Image.open(BytesIO(response.content))

prompt = "Describe this image."
inputs = processor(images=image, text=prompt, return_tensors="pt")
outputs = model.generate(**inputs)
generated_text = processor.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```

The script above reads an image from the given URL, hands it to the initialized `Qwen2-VL-2B-Instruct` instance for analysis, and prints the text description generated by the model.