Qwen2 & 2.5
Overview
Alibaba's Qwen2.5 series remains one of the most popular open base-model families today. Compared with the Qwen2 series, Qwen2.5 is essentially the same design continued-trained on a larger, higher-quality dataset.
Code:https://github.com/QwenLM
The Qwen series provides pretrained and instruction-tuned models at five sizes, and reached SOTA among open-source models as soon as it was released. The whole family uses GQA (Grouped-Query Attention), while the smaller models use tied embeddings (sharing the input embedding matrix with the output projection). All models are pretrained on 32K-context data and then extended, supporting up to 128K at the longest. At release they outperformed most other large models, with especially notable gains in multilingual support and in code and math ability; see the official documentation for detailed benchmarks.
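To make the two techniques above concrete, here is a minimal PyTorch sketch of the GQA idea (several query heads share one key/value head, shrinking the KV cache) and of tied embeddings. Every size below is invented for illustration and is not Qwen2.5's actual configuration.
import torch
import torch.nn.functional as F

# --- GQA: several query heads share one key/value head ---
# Hypothetical sizes, not Qwen2.5's real config.
batch, seq, head_dim = 1, 16, 64
n_q_heads, n_kv_heads = 8, 2            # 8 query heads, 2 shared KV heads
group = n_q_heads // n_kv_heads         # 4 query heads per KV head

q = torch.randn(batch, n_q_heads, seq, head_dim)
k = torch.randn(batch, n_kv_heads, seq, head_dim)  # KV cache is 4x smaller
v = torch.randn(batch, n_kv_heads, seq, head_dim)

# Repeat each KV head for its group of query heads, then run standard attention.
k = k.repeat_interleave(group, dim=1)
v = v.repeat_interleave(group, dim=1)
out = F.scaled_dot_product_attention(q, k, v)
print(out.shape)  # torch.Size([1, 8, 16, 64])

# --- Tied embeddings: the output head reuses the input embedding weights ---
vocab, hidden = 1000, 64
embed = torch.nn.Embedding(vocab, hidden)
logits = out.mean(dim=1) @ embed.weight.T  # reuse embed.weight as the LM head
print(logits.shape)  # torch.Size([1, 16, 1000])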

OCR Invocation
To compare the OCR ability of the two Qwen generations, we need to design a prompt and feed in images or PDFs for batch recognition. During batch recognition I ran into an out-of-memory problem; after fixing it, I am sharing the prompt and code I used for reference. This article uses the 7B variants (a 3090 can only fit 7B), and for consistency the same prompt is used for all four models. Model download links:
Qwen2.5 model: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct
Qwen2 model: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
Since huggingface.co is not directly reachable from the local network, the models need to be downloaded through a domestic mirror, as follows:
import os

# Set the mirror endpoint environment variable
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# Download the Qwen2.5 model (other models work the same way)
os.system(
    'huggingface-cli download Qwen/Qwen2.5-VL-7B-Instruct '  # repo id
    '--local-dir ./Qwen2.5-VL-7B-Instruct '                  # local directory
    '--local-dir-use-symlinks False'
)
import os
import gc
import time
import torch
from tqdm import tqdm
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

def clean_memory():
    """Free GPU memory between images."""
    torch.cuda.synchronize()  # wait for all pending CUDA operations
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(1)  # small pause between cleanups

def set_seed(seed):
    """Make generation as deterministic as possible."""
    import random
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def load_model():
    """Model loading with memory optimizations."""
    model_dir = "/Qwen2.5-VL-7B-Instruct"  # local model path
    # Clean up first so loading goes smoothly
    clean_memory()
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype=torch.bfloat16,
        device_map="cuda:2",
        trust_remote_code=True
    )
    processor = AutoProcessor.from_pretrained(model_dir)
    return model, processor

# Initialize the model (loaded once, globally)
model, processor = load_model()

# Input and output base directories
input_dir = '/images'
output_dir = "/result/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# complex prompt
prompt = r'''You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:
1. Text Processing:
- Accurately recognize all text content in the PDF image without guessing or inferring.
- Convert the recognized text into Markdown format.
- Maintain the original document structure, including headings, paragraphs, lists, etc.
2. Mathematical Formula Processing:
- Convert all mathematical formulas to LaTeX format.
- Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
- Enclose block formulas with \\[ \\]. For example: \[ \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]
3. Table Processing:
- Convert tables to HTML format.
- Wrap the entire table with <table> and </table>.
4. Figure Handling:
- Ignore figures content in the PDF image. Do not attempt to describe or convert images.
5. Output Format:
- Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
- For complex layouts, try to maintain the original document's structure and format as closely as possible.
Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
'''
image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp')
current_time_1 = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
print(current_time_1)
def process_single_image(image_path, output_dir):
    """Process one image and clean up memory afterwards."""
    # Initialize every variable we may need to release later
    inputs = None
    generated_ids = None
    generated_ids_trimmed = None
    output_text = None
    try:
        # Build the output path
        basename = os.path.splitext(os.path.basename(image_path))[0]
        markdown_file = os.path.join(output_dir, f"{basename}.md")
        if os.path.exists(markdown_file):
            # print(f"Skipping existing file: {markdown_file}")
            return True
        # Build the model input
        messages = [{
            "role": "user",
            "content": [
                {"type": "image", "image": image_path, "max_pixels": 1080 * 1080},
                {"type": "text", "text": prompt}
            ]
        }]
        # Preprocess
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        # Run under no_grad so no activation memory is kept
        with torch.no_grad():
            inputs = processor(
                text=[text],
                images=image_inputs,
                videos=video_inputs,
                padding=True,
                return_tensors="pt"
            ).to(model.device)  # .to("auto") is invalid; move inputs to the model's device
            # Generate
            set_seed(0)
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False  # greedy decoding; temperature is ignored when sampling is off
            )
        # Post-process: strip the prompt tokens from each generated sequence
        generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        # Save the result
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(output_text)
        return True
    except Exception as e:
        print(f"Error while processing {image_path}: {str(e)}")
        return False
    finally:
        # Drop the local references so the tensors can actually be freed
        inputs = generated_ids = generated_ids_trimmed = output_text = None
        clean_memory()
if __name__ == "__main__":
    # Create the output directory
    os.makedirs(output_dir, exist_ok=True)
    # Collect all files to process
    file_list = []
    for root, _, files in os.walk(input_dir):
        for name in files:
            if any(name.lower().endswith(ext) for ext in image_extensions):
                file_list.append(os.path.join(root, name))
    print(f"Processing {len(file_list)} files")
    start_time = time.time()
    # Iterate with a progress bar
    for image_path in tqdm(file_list, desc="Progress"):
        success = False
        retry = 0
        while not success and retry < 3:  # retry at most 3 times
            success = process_single_image(image_path, output_dir)
            if not success:
                retry += 1
                print(f"Retry {retry}: {image_path}")
                time.sleep(2 ** retry)  # exponential backoff
        if not success:
            print(f"!! Failed to process: {image_path}")
    # Timing info
    end_time = time.time()
    print(f"\nStart time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(start_time))}")
    print(f"End time: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(end_time))}")
    print(f"Total time: {end_time - start_time:.2f} s")
Since Qwen2 and Qwen2.5 are loaded through different model classes, the import and model-loading parts must be changed for Qwen2 to:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_dir,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2"
)
The InternVL (Shusheng) Large Model Series
Overview
Most mainstream MLLMs today, whether open-source or closed-source, still follow a "post-hoc" construction pattern: first train a text-only large language model (LLM), then adapt it to visual inputs through a complex, often multi-stage pipeline. This pattern works, but it carries inherent cross-modal alignment challenges and usually requires large amounts of domain-specific auxiliary data plus careful parameter-freezing or multi-stage fine-tuning strategies, making it resource-intensive and of limited efficiency.
Against this backdrop the InternVL series emerged and has kept pushing multimodal capabilities forward. The latest generation is InternVL3. It breaks away from the traditional "post-hoc" recipe and pioneers a native multimodal pre-training paradigm, aiming to improve model efficiency and performance at the root. InternVL3 aims not merely to catch up with or surpass the best open-source MLLMs, but to compete head-to-head with leading closed-source commercial models, while upholding open-science principles to advance the next generation of MLLMs.

Paper: https://arxiv.org/abs/2504.10479
Code:https://github.com/OpenGVLab/InternVL
InternVL3-8B:https://huggingface.co/OpenGVLab/InternVL3-8B
InternVL2-8B:https://huggingface.co/OpenGVLab/InternVL2-8B
Dataset: https://huggingface.co/datasets/OpenGVLab/InternVL-Data
Model Architecture
The InternVL3 models keep the "ViT-MLP-LLM" paradigm: a randomly initialized MLP projector connects a freshly incrementally pre-trained InternViT to various pre-trained LLMs (including InternLM3 and Qwen2.5).
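To make the ViT-MLP-LLM wiring concrete, here is a hypothetical minimal sketch of the glue: ViT patch features are mapped by a small MLP into the LLM's embedding space and concatenated with the text embeddings. All dimensions are invented for illustration; this is not InternVL3's actual code.
import torch
import torch.nn as nn

vit_dim, llm_dim = 1024, 4096  # hypothetical feature sizes

projector = nn.Sequential(      # the randomly initialized MLP projector
    nn.LayerNorm(vit_dim),
    nn.Linear(vit_dim, llm_dim),
    nn.GELU(),
    nn.Linear(llm_dim, llm_dim),
)

vision_feats = torch.randn(1, 256, vit_dim)  # ViT output: 256 visual tokens
text_embeds = torch.randn(1, 32, llm_dim)    # text token embeddings from the LLM

visual_tokens = projector(vision_feats)       # project into the LLM embedding space
llm_inputs = torch.cat([visual_tokens, text_embeds], dim=1)  # sequence fed to the LLM
print(llm_inputs.shape)  # torch.Size([1, 288, 4096])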
For inference, InternVL3 applies a pixel-unshuffle operation that cuts the number of visual tokens to a quarter, and uses a dynamic-resolution strategy that splits images into 448×448 tiles. The key addition since InternVL 2.0 is support for multi-image and video data. InternVL3 also integrates Variable Visual Position Encoding (V2PE), which gives visual tokens smaller, more flexible position increments and thereby better long-context understanding.
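The pixel-unshuffle (space-to-depth) idea can be shown in a few lines: merging each 2×2 patch of the feature map into one position cuts the token count to a quarter while growing the channel dimension 4×. The shapes below are illustrative, not InternVL3's real ones.
import torch

x = torch.randn(1, 1024, 32, 32)  # (batch, channels, H, W) visual feature map
y = torch.nn.functional.pixel_unshuffle(x, downscale_factor=2)
print(y.shape)  # torch.Size([1, 4096, 16, 16])
# 32*32 = 1024 visual tokens -> 16*16 = 256 tokens (a quarter), each 4x wider.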
For deployment, InternVL3 can be exposed as an OpenAI-compatible API via LMDeploy's api_server: install lmdeploy>=0.7.3 and run the serve command. Clients then call the model through the OpenAI API, passing the model name, messages, and other parameters to get responses.
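As a minimal sketch of that flow: launch the server with something like "lmdeploy serve api_server OpenGVLab/InternVL3-8B --server-port 23333", then query it with the standard OpenAI client. The port, API key, and image URL below are placeholders.
from openai import OpenAI

client = OpenAI(api_key="not-needed", base_url="http://127.0.0.1:23333/v1")
model_name = client.models.list().data[0].id  # name of the served model
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/demo.jpg"}},  # placeholder URL
        ],
    }],
    temperature=0.0,
)
print(response.choices[0].message.content)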

OCR Invocation
The InternVL series has its own logic for uploading and passing images; see the official model documentation for details. Below is a self-contained, ready-to-run version of the code; be sure to replace the local model path. Downloading works the same way as the HuggingFace mirror approach used for Qwen above:
import os
import random
import torch
import torchvision.transforms as T
from tqdm import tqdm
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoModel
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # enumerate candidate tile grids (i columns x j rows)
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    # find the grid whose aspect ratio is closest to the image's
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image into tiles
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values
if __name__ == "__main__":
    random.seed(0)
    path = "./InternVL2-8B"  # model path
    img_path = "/images"     # image directory
    save_path = "/result"    # results directory
    os.makedirs(save_path, exist_ok=True)  # create the output directory
    model = AutoModel.from_pretrained(
        path,
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
        use_flash_attn=True,
        trust_remote_code=True).eval().cuda()
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    # greedy decoding (do_sample=False), so no temperature is needed
    generation_config = dict(max_new_tokens=4096, do_sample=False, no_repeat_ngram_size=20)
    prompt = r'''You are an AI assistant specialized in converting PDF images to Markdown format. Please follow these instructions for the conversion:
1. Text Processing:
- Accurately recognize all text content in the PDF image without guessing or inferring.
- Convert the recognized text into Markdown format.
- Maintain the original document structure, including headings, paragraphs, lists, etc.
2. Mathematical Formula Processing:
- Convert all mathematical formulas to LaTeX format.
- Enclose inline formulas with \( \). For example: This is an inline formula \( E = mc^2 \)
- Enclose block formulas with \\[ \\]. For example: \[ \frac{-b \pm \sqrt{b^2 - 4ac}}{2a} \]
3. Table Processing:
- Convert tables to HTML format.
- Wrap the entire table with <table> and </table>.
4. Figure Handling:
- Ignore figures content in the PDF image. Do not attempt to describe or convert images.
5. Output Format:
- Ensure the output Markdown document has a clear structure with appropriate line breaks between elements.
- For complex layouts, try to maintain the original document's structure and format as closely as possible.
Please strictly follow these guidelines to ensure accuracy and consistency in the conversion. Your task is to accurately convert the content of the PDF image into Markdown format without adding any extra explanations or comments.
'''
    question = f'<image>\n{prompt}'
    for img_name in tqdm(os.listdir(img_path)):
        # Clean the file name and check the extension
        img_name_clean = img_name.strip()
        if not (img_name_clean.endswith('.jpg') or img_name_clean.endswith('.png')):
            continue
        # Build the output path and skip if it already exists
        md_filename = img_name_clean[:-4] + '.md'
        md_path = os.path.join(save_path, md_filename)
        if os.path.exists(md_path):
            print(f"Skipping {img_name} as {md_filename} already exists.")
            continue
        # Process the image
        try:
            img_path_tmp = os.path.join(img_path, img_name)  # use the original name to access the file
            pixel_values = load_image(img_path_tmp, max_num=6).to(torch.bfloat16).cuda()
            response = model.chat(tokenizer, pixel_values, question, generation_config)
            # Save the result
            with open(md_path, 'w', encoding='utf-8') as output_file:
                output_file.write(response)
        except Exception as e:
            print(f"Error processing {img_name}: {str(e)}")
            continue