探索Intel Weight-Only量化:提升Hugging Face模型运行效率

引言

在机器学习领域,模型的大小和推理效率一直是开发者关注的重点。为了提升模型运行效率,特别是在资源受限的设备上,量化技术逐渐成为一种重要的策略。本文将介绍如何使用Intel Extension for Transformers中的Weight-Only Quantization技术,通过量化Hugging Face模型权重来提升推理效率。

主要内容

量化基础

量化是一种将模型参数从浮点数转换为低精度整数的技术,以降低模型大小和提高计算效率。Intel Extension for Transformers提供了多种量化数据类型,如int8、int4、nf4等,支持在CPU上高效执行推理任务。

安装依赖

在开始之前,请确保安装必要的Python包:

%pip install transformers --quiet
%pip install intel-extension-for-transformers

模型加载

我们可以通过WeightOnlyQuantPipeline类加载模型。设置量化配置后,从模型ID加载模型:

from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig
INFO 07-25 07:11:43 [model_runner_v1.py:1745] Starting to load model /models/z50051264/summary/Qwen2.5-7B-nf4/... ERROR 07-25 07:11:44 [core.py:586] EngineCore failed to start. ERROR 07-25 07:11:44 [core.py:586] Traceback (most recent call last): ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core ERROR 07-25 07:11:44 [core.py:586] engine_core = EngineCoreProc(*args, **kwargs) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in __init__ ERROR 07-25 07:11:44 [core.py:586] super().__init__(vllm_config, executor_class, log_stats, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 75, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.model_executor = executor_class(vllm_config) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/executor_base.py", line 53, in __init__ ERROR 07-25 07:11:44 [core.py:586] self._init_executor() ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 48, in _init_executor ERROR 07-25 07:11:44 [core.py:586] self.collective_rpc("load_model") ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 57, in collective_rpc ERROR 07-25 07:11:44 [core.py:586] answer = run_method(self.driver_worker, method, args, kwargs) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/utils/__init__.py", line 2736, in run_method ERROR 07-25 07:11:44 [core.py:586] return func(*args, **kwargs) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 240, in load_model ERROR 07-25 07:11:44 [core.py:586] self.model_runner.load_model() ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1748, in load_model ERROR 07-25 07:11:44 [core.py:586] self.model = 
get_model(vllm_config=self.vllm_config) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 59, in get_model ERROR 07-25 07:11:44 [core.py:586] return loader.load_model(vllm_config=vllm_config, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model ERROR 07-25 07:11:44 [core.py:586] model = initialize_model(vllm_config=vllm_config, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model ERROR 07-25 07:11:44 [core.py:586] return model_class(vllm_config=vllm_config, prefix=prefix) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 448, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.model = Qwen2Model(vllm_config=vllm_config, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__ ERROR 07-25 07:11:44 [core.py:586] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 317, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.start_layer, self.end_layer, self.layers = make_layers( ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 639, in make_layers ERROR 07-25 07:11:44 [core.py:586] [PPMissingLayer() for _ in range(start_layer)] + [ ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 640, in <listcomp> ERROR 07-25 07:11:44 [core.py:586] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}")) ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 319, in <lambda> ERROR 07-25 07:11:44 [core.py:586] lambda prefix: decoder_layer_type(config=config, ERROR 07-25 07:11:44 
[core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 216, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.self_attn = Qwen2Attention( ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 137, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.qkv_proj = QKVParallelLinear( ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 874, in __init__ ERROR 07-25 07:11:44 [core.py:586] super().__init__(input_size=input_size, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 420, in __init__ ERROR 07-25 07:11:44 [core.py:586] super().__init__(input_size, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 266, in __init__ ERROR 07-25 07:11:44 [core.py:586] self.quant_method = quant_config.get_quant_method(self, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 92, in get_quant_method ERROR 07-25 07:11:44 [core.py:586] if self.is_layer_skipped_ascend(prefix, ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 126, in is_layer_skipped_ascend ERROR 07-25 07:11:44 [core.py:586] is_shard_skipped = self.quant_description[shard_prefix + ERROR 07-25 07:11:44 [core.py:586] KeyError: 'model.layers.0.self_attn.q_proj.weight' Process EngineCore_0: Traceback (most recent call last): File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap self.run() File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 590, in run_engine_core raise e File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core engine_core = 
EngineCoreProc(*args, **kwargs) File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in __init__ super().__init__(vllm_config, executor_class, log_stats, File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 75, in __init__ self.model_executor = executor_class(vllm_config) File "/vllm-workspace/vllm/vllm/executor/executor_base.py", line 53, in __init__ self._init_executor() File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 48, in _init_executor self.collective_rpc("load_model") File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 57, in collective_rpc answer = run_method(self.driver_worker, method, args, kwargs) File "/vllm-workspace/vllm/vllm/utils/__init__.py", line 2736, in run_method return func(*args, **kwargs) File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 240, in load_model self.model_runner.load_model() File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1748, in load_model self.model = get_model(vllm_config=self.vllm_config) File "/vllm-workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 59, in get_model return loader.load_model(vllm_config=vllm_config, File "/vllm-workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model model = initialize_model(vllm_config=vllm_config, File "/vllm-workspace/vllm/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model return model_class(vllm_config=vllm_config, prefix=prefix) File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 448, in __init__ self.model = Qwen2Model(vllm_config=vllm_config, File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__ old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 317, in __init__ self.start_layer, self.end_layer, self.layers = make_layers( File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", 
line 639, in make_layers [PPMissingLayer() for _ in range(start_layer)] + [ File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 640, in <listcomp> maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}")) File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 319, in <lambda> lambda prefix: decoder_layer_type(config=config, File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 216, in __init__ self.self_attn = Qwen2Attention( File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 137, in __init__ self.qkv_proj = QKVParallelLinear( File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 874, in __init__ super().__init__(input_size=input_size, File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 420, in __init__ super().__init__(input_size, File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 266, in __init__ self.quant_method = quant_config.get_quant_method(self, File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 92, in get_quant_method if self.is_layer_skipped_ascend(prefix, File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 126, in is_layer_skipped_ascend is_shard_skipped = self.quant_description[shard_prefix + KeyError: 'model.layers.0.self_attn.q_proj.weight' Traceback (most recent call last): File "/usr/local/python3.10.17/lib/python3.10/runpy.py", line 196, in _run_module_as_main return _run_code(code, main_globals, None, File "/usr/local/python3.10.17/lib/python3.10/runpy.py", line 86, in _run_code exec(code, run_globals) File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1495, in <module> uvloop.run(run_server(args)) File "/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run return loop.run_until_complete(wrapper()) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete File 
"/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper return await main File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1431, in run_server await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1451, in run_server_worker async with build_async_engine_client(args, client_config) as engine_client: File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__ return await anext(self.gen) File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 158, in build_async_engine_client async with build_async_engine_client_from_engine_args( File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__ return await anext(self.gen) File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 194, in build_async_engine_client_from_engine_args async_llm = AsyncLLM.from_vllm_config( File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 162, in from_vllm_config return cls( File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 124, in __init__ self.engine_core = EngineCoreClient.make_async_mp_client( File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 96, in make_async_mp_client return AsyncMPClient(*client_args) File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 666, in __init__ super().__init__( File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 403, in __init__ with launch_core_engines(vllm_config, executor_class, File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 142, in __exit__ next(self.gen) File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 434, in launch_core_engines wait_for_engine_startup( File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 484, in wait_for_engine_startup raise RuntimeError("Engine core initialization failed. 
" RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} [ERROR] 2025-07-25-07:11:52 (PID:1889, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception 这是怎么回事儿???我启动没量化的版本是正常的,但是启动量化模型出现上述错误
07-26
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值