OpenVINO GenAI 是英特尔推出的生成式 AI 开发工具库,基于 OpenVINO Runtime 构建,专为在英特尔 CPU、GPU、NPU 等硬件上高效部署生成式模型而设计。它通过简化开发流程、优化推理性能,帮助开发者快速实现文本生成、图像创作、语音交互等多模态应用。
安装依赖
安装使用 OpenVINO GenAI 所需的软件包。
pip install openvino-genai
下载模型
Qwen3-14B-INT4-OV模型下载地址:
https://www.modelscope.cn/models/OpenVINO/Qwen3-14B-int4-ov
模型量化参数:
权重压缩使用 nncf.compress_weights 进行,参数如下:
- mode: INT4_ASYM
- ratio: 0.8
- group_size: 128
兼容性:
- OpenVINO 版本 2025.1.0 及以上
模型推理
查看GPU支持的优化能力:
# Query which optimization capabilities (precisions) the GPU plugin supports.
# NOTE: the `openvino.runtime` namespace is deprecated since OpenVINO 2025.0;
# the article requires 2025.1.0+, so import the top-level `openvino` package.
import openvino as ov

core = ov.Core()
# Ask the GPU device for its supported optimization capabilities
# (e.g. FP16 / INT8 inference precisions).
capabilities = core.get_property("GPU", "OPTIMIZATION_CAPABILITIES")
print(f"Intel Xe GPU 支持的精度能力: {capabilities}")
查看可用的推理设备:
# Enumerate all inference devices visible to the OpenVINO runtime and
# print details for every GPU found.
# NOTE: `openvino.runtime` is deprecated; use the top-level `openvino` package.
import openvino as ov

core = ov.Core()
print(core.available_devices)

# Print detailed information for each GPU device.
for device in core.available_devices:
    if "GPU" in device.upper():
        print(f"GPU 设备 ID: {device}")
        print(f"GPU 设备名称: {core.get_property(device, 'FULL_DEVICE_NAME')}")
运行模型推理:
# Minimal text-generation example with OpenVINO GenAI.
import openvino_genai as ov_genai

model_path = "D:/models/Qwen3-14B-int4-ov"
device = "CPU"

# Build the LLM pipeline for the chosen model and target device.
pipe = ov_genai.LLMPipeline(model_path, device)

# Re-apply the tokenizer's bundled chat template to itself
# (presumably to make sure chat formatting is active — same call chain as before,
# just with the tokenizer held in a local instead of fetched twice).
tokenizer = pipe.get_tokenizer()
tokenizer.set_chat_template(tokenizer.chat_template)

print(pipe.generate("What is OpenVINO?", max_length=200))
计算首token生成时间(TTFT)和推理速度:
# Measure time-to-first-token (TTFT) and decoding throughput for a streamed
# generation with OpenVINO GenAI.
import math
import time

import openvino_genai as ov_genai
from prompt_config import make_prompt

prompt = make_prompt()
model_path = "D:/models/Qwen3-14B-int4-ov"
device = "GPU"

pipe = ov_genai.LLMPipeline(model_path, device)
tok = pipe.get_tokenizer()
tok.set_chat_template(tok.chat_template)

def count_input_tokens(prompt_obj):
    """Best-effort token count of the prompt.

    Tries the pipeline tokenizer first; falls back to a whitespace word
    count (never less than 1) if encoding fails or yields no ``input_ids``.
    """
    try:
        enc_in = tok.encode(prompt_obj, add_special_tokens=True)
        ids_in = getattr(enc_in, "input_ids", None)
        if ids_in is not None:
            arr = ids_in.data  # numpy view of the underlying ov.Tensor
            return int(arr.shape[-1]) if arr.ndim > 1 else int(arr.shape[0])
    except Exception:
        pass  # tokenizer may reject this prompt type; use the fallback below
    return max(1, len(str(prompt_obj).split()))

# (Optional) warm-up so first-run compilation/model load does not inflate TTFT.
pipe.generate("warmup", max_new_tokens=1)

# Count prompt tokens before timing starts.
input_tokens = count_input_tokens(prompt)

# Streamed TTFT measurement: first_token_ms is a one-element list so the
# streaming callback (a closure) can mutate it.
first_token_ms = [None]
t0 = time.perf_counter()

def on_text(chunk: str):
    # Record the arrival time (ms) of the very first non-empty chunk.
    if first_token_ms[0] is None and chunk:
        first_token_ms[0] = (time.perf_counter() - t0) * 1000.0
    print(chunk, end="", flush=True)
    return False  # False => keep streaming

streamer = ov_genai.TextStreamer(tok, on_text)

print("\n=== Streaming output ===")
t0 = time.perf_counter()  # reset right before generation so TTFT excludes setup
text_out = pipe.generate(prompt, max_new_tokens=1024, streamer=streamer)
t1 = time.perf_counter()
print("\n=== End of streaming ===\n")

# ========= Count generated (output) tokens =========
# Re-encode only the newly generated text, without special tokens.
gen_tokens = None
enc = tok.encode(text_out, add_special_tokens=False)
ids = getattr(enc, "input_ids", None)  # ov.Tensor
if ids is not None:
    arr = ids.data  # numpy array
    gen_tokens = int(arr.shape[-1]) if arr.ndim > 1 else int(arr.shape[0])
if gen_tokens is None:
    # Fallback: rough word count so the metric computations below never
    # operate on None (the original crashed here with a TypeError).
    gen_tokens = max(1, len(text_out.split()))

# ========= Compute and print metrics =========
total_elapsed_s = t1 - t0
ttft_ms = first_token_ms[0] if first_token_ms[0] is not None else float("nan")

# Throughput measured after the first token better reflects steady-state
# decode performance (excludes prefill).
after_first_s = total_elapsed_s - (0.0 if math.isnan(ttft_ms) else ttft_ms / 1000.0)
after_first_s = max(1e-9, after_first_s)

tps_overall = gen_tokens / max(1e-9, total_elapsed_s)
tps_after_first = gen_tokens / after_first_s
total_tokens = input_tokens + gen_tokens

print("=== Tokens ===")
print(f"Prompt tokens (input): {input_tokens}")
print(f"New tokens (output): {gen_tokens}")
print(f"Total tokens (prompt + output): {total_tokens}")
print("\n=== Perf (manual) ===")
print(f"TTFT: {ttft_ms:.2f} ms" if not math.isnan(ttft_ms) else "TTFT: N/A")
print(f"Throughput (overall): {tps_overall:.2f} tokens/s")
print(f"Throughput (after first token): {tps_after_first:.2f} tokens/s")
print(f"Generated tokens: {gen_tokens}")
print(f"Total elapsed: {total_elapsed_s*1000:.2f} ms")
538

被折叠的评论
为什么被折叠?



