1 Image Inference
from PIL import Image
import requests
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Load the Qwen2-VL-2B-Instruct checkpoint from the local ModelScope cache.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct",
    torch_dtype="auto",
    device_map="auto",
)
# The processor handles both the chat template and the image preprocessing.
processor = AutoProcessor.from_pretrained("/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-2B-Instruct")
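# Optional note (an assumption based on the Qwen2-VL processor API, and on section 3 below):
# min_pixels/max_pixels can be passed to AutoProcessor.from_pretrained to bound the number of
# visual tokens per image, e.g. min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28.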
# Open a local image and build a single-turn conversation containing one image
# placeholder plus a text instruction.
image = Image.open("/home/fyo/Pictures/earthquake.jpg")
conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
            },
            {"type": "text", "text": "描述这张图."},  # "Describe this image."
        ],
    }
]
# Render the chat template, preprocess text + image together, and move everything to the GPU.
text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(
    text=[text_prompt], images=[image], padding=True, return_tensors="pt"
)
inputs = inputs.to("cuda")

# Generate, then strip the prompt tokens so only the newly generated answer is decoded.
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [
    out_ids[len(in_ids):]
    for in_ids, out_ids in zip(inputs.input_ids, output_ids)
]
output_text = processor.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
print(output_text)
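The requests import above is only needed when the image comes from the network rather than a local file. A minimal sketch, assuming a placeholder URL that is not from the original:

# Hypothetical example: load the image over HTTP instead of from a local path.
url = "https://example.com/sample.jpg"  # placeholder URL
image = Image.open(requests.get(url, stream=True).raw)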
2 Video Inference
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

# Video understanding uses the larger 7B-Instruct checkpoint.
model_name = "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_name)
def chat_with_video(file_name, query, video_width, video_height, fps=1.0):
    # Build a single-turn conversation with one video entry and one text query.
    # max_pixels caps the per-frame resolution and fps controls how densely frames are sampled.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": file_name,
                    "max_pixels": video_width * video_height,
                    "fps": fps,
                },
                {"type": "text", "text": query},
            ],
        }
    ]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # qwen_vl_utils extracts and preprocesses the image/video inputs referenced in messages.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    # Generate, then decode only the new tokens (drop the prompt part of each sequence).
    generated_ids = model.generate(**inputs, max_new_tokens=1500)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text
# The query asks: "What does this video show? Please describe all the details."
output_text = chat_with_video("/home/fyo/VisionLLM/downloads/3.mp4", "这个视频展示了什么?请描述所有细节", 734, 576, fps=0.5)
print(output_text)
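process_vision_info can also take a list of pre-extracted frame images as the "video" entry instead of an .mp4 path (per the Qwen2-VL usage examples). A minimal sketch; the frame paths and query are placeholders, not files from the original:

# Hypothetical sketch: pass pre-extracted frames as the "video" field instead of a video file.
frame_messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "video",
                "video": [
                    "file:///path/to/frame1.jpg",  # placeholder frame paths
                    "file:///path/to/frame2.jpg",
                    "file:///path/to/frame3.jpg",
                ],
            },
            {"type": "text", "text": "描述这段视频."},  # "Describe this video."
        ],
    }
]
image_inputs, video_inputs = process_vision_info(frame_messages)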
3 Screen Inference
from transformers import AutoModelForVision2Seq, AutoProcessor
from qwen_vl_utils import process_vision_info
from PIL import Image
import torch
import mss
import time

model_name = "/home/fyo/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct"
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float16)
# min_pixels/max_pixels bound the number of visual tokens produced per screenshot.
processor = AutoProcessor.from_pretrained(
    model_name,
    min_pixels=256 * 28 * 28,
    max_pixels=2000 * 28 * 28,
)
model.eval()
def capture_screen_region():
    # Grab a fixed region of the screen with mss and convert it to a PIL image.
    with mss.mss() as sct:
        monitor = {
            "top": 0,
            "left": 0,
            "width": 1920,
            "height": 1440,
        }
        screenshot = sct.grab(monitor)
        img = Image.frombytes('RGB', screenshot.size, screenshot.rgb)
        return img
def describe_image(image):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image in Chinese."},
            ],
        }
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info(messages)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    with torch.no_grad():
        model.to(device)
        outputs = model.generate(**inputs, max_new_tokens=128)
    # Trim the prompt tokens before decoding so only the generated description is returned,
    # matching the image and video examples above.
    trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, outputs)]
    description = processor.batch_decode(trimmed, skip_special_tokens=True)
    return description[0]
def real_time_description(interval=1):
    # Capture the screen, describe it, and repeat every `interval` seconds.
    while True:
        img = capture_screen_region()
        description = describe_image(img)
        print(f"Description: {description}")
        time.sleep(interval)

real_time_description()
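The capture region above is hard-coded to a 1920x1440 rectangle at the top-left corner. mss also exposes the detected displays as sct.monitors, so a variant that grabs the whole first physical monitor could look like this sketch (an assumption layered on the original helper, not part of it):

def capture_primary_monitor():
    # Sketch: grab the entire first physical monitor instead of a fixed region.
    with mss.mss() as sct:
        monitor = sct.monitors[1]  # 0 = all monitors combined, 1 = first monitor
        screenshot = sct.grab(monitor)
        return Image.frombytes('RGB', screenshot.size, screenshot.rgb)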