INFO 07-25 07:11:43 [model_runner_v1.py:1745] Starting to load model /models/z50051264/summary/Qwen2.5-7B-nf4/...
ERROR 07-25 07:11:44 [core.py:586] EngineCore failed to start.
ERROR 07-25 07:11:44 [core.py:586] Traceback (most recent call last):
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core
ERROR 07-25 07:11:44 [core.py:586] engine_core = EngineCoreProc(*args, **kwargs)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in __init__
ERROR 07-25 07:11:44 [core.py:586] super().__init__(vllm_config, executor_class, log_stats,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 75, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.model_executor = executor_class(vllm_config)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/executor_base.py", line 53, in __init__
ERROR 07-25 07:11:44 [core.py:586] self._init_executor()
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 48, in _init_executor
ERROR 07-25 07:11:44 [core.py:586] self.collective_rpc("load_model")
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
ERROR 07-25 07:11:44 [core.py:586] answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/utils/__init__.py", line 2736, in run_method
ERROR 07-25 07:11:44 [core.py:586] return func(*args, **kwargs)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 240, in load_model
ERROR 07-25 07:11:44 [core.py:586] self.model_runner.load_model()
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1748, in load_model
ERROR 07-25 07:11:44 [core.py:586] self.model = get_model(vllm_config=self.vllm_config)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 59, in get_model
ERROR 07-25 07:11:44 [core.py:586] return loader.load_model(vllm_config=vllm_config,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model
ERROR 07-25 07:11:44 [core.py:586] model = initialize_model(vllm_config=vllm_config,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model
ERROR 07-25 07:11:44 [core.py:586] return model_class(vllm_config=vllm_config, prefix=prefix)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 448, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.model = Qwen2Model(vllm_config=vllm_config,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__
ERROR 07-25 07:11:44 [core.py:586] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 317, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.start_layer, self.end_layer, self.layers = make_layers(
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 639, in make_layers
ERROR 07-25 07:11:44 [core.py:586] [PPMissingLayer() for _ in range(start_layer)] + [
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 640, in <listcomp>
ERROR 07-25 07:11:44 [core.py:586] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 319, in <lambda>
ERROR 07-25 07:11:44 [core.py:586] lambda prefix: decoder_layer_type(config=config,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 216, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.self_attn = Qwen2Attention(
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 137, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.qkv_proj = QKVParallelLinear(
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 874, in __init__
ERROR 07-25 07:11:44 [core.py:586] super().__init__(input_size=input_size,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 420, in __init__
ERROR 07-25 07:11:44 [core.py:586] super().__init__(input_size,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 266, in __init__
ERROR 07-25 07:11:44 [core.py:586] self.quant_method = quant_config.get_quant_method(self,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 92, in get_quant_method
ERROR 07-25 07:11:44 [core.py:586] if self.is_layer_skipped_ascend(prefix,
ERROR 07-25 07:11:44 [core.py:586] File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 126, in is_layer_skipped_ascend
ERROR 07-25 07:11:44 [core.py:586] is_shard_skipped = self.quant_description[shard_prefix +
ERROR 07-25 07:11:44 [core.py:586] KeyError: 'model.layers.0.self_attn.q_proj.weight'
Process EngineCore_0:
Traceback (most recent call last):
File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/local/python3.10.17/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 590, in run_engine_core
raise e
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 577, in run_engine_core
engine_core = EngineCoreProc(*args, **kwargs)
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 404, in __init__
super().__init__(vllm_config, executor_class, log_stats,
File "/vllm-workspace/vllm/vllm/v1/engine/core.py", line 75, in __init__
self.model_executor = executor_class(vllm_config)
File "/vllm-workspace/vllm/vllm/executor/executor_base.py", line 53, in __init__
self._init_executor()
File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 48, in _init_executor
self.collective_rpc("load_model")
File "/vllm-workspace/vllm/vllm/executor/uniproc_executor.py", line 57, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
File "/vllm-workspace/vllm/vllm/utils/__init__.py", line 2736, in run_method
return func(*args, **kwargs)
File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/worker_v1.py", line 240, in load_model
self.model_runner.load_model()
File "/vllm-workspace/vllm-ascend/vllm_ascend/worker/model_runner_v1.py", line 1748, in load_model
self.model = get_model(vllm_config=self.vllm_config)
File "/vllm-workspace/vllm/vllm/model_executor/model_loader/__init__.py", line 59, in get_model
return loader.load_model(vllm_config=vllm_config,
File "/vllm-workspace/vllm/vllm/model_executor/model_loader/base_loader.py", line 38, in load_model
model = initialize_model(vllm_config=vllm_config,
File "/vllm-workspace/vllm/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model
return model_class(vllm_config=vllm_config, prefix=prefix)
File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 448, in __init__
self.model = Qwen2Model(vllm_config=vllm_config,
File "/vllm-workspace/vllm/vllm/compilation/decorators.py", line 152, in __init__
old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 317, in __init__
self.start_layer, self.end_layer, self.layers = make_layers(
File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 639, in make_layers
[PPMissingLayer() for _ in range(start_layer)] + [
File "/vllm-workspace/vllm/vllm/model_executor/models/utils.py", line 640, in <listcomp>
maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 319, in <lambda>
lambda prefix: decoder_layer_type(config=config,
File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 216, in __init__
self.self_attn = Qwen2Attention(
File "/vllm-workspace/vllm/vllm/model_executor/models/qwen2.py", line 137, in __init__
self.qkv_proj = QKVParallelLinear(
File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 874, in __init__
super().__init__(input_size=input_size,
File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 420, in __init__
super().__init__(input_size,
File "/vllm-workspace/vllm/vllm/model_executor/layers/linear.py", line 266, in __init__
self.quant_method = quant_config.get_quant_method(self,
File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 92, in get_quant_method
if self.is_layer_skipped_ascend(prefix,
File "/vllm-workspace/vllm-ascend/vllm_ascend/quantization/quant_config.py", line 126, in is_layer_skipped_ascend
is_shard_skipped = self.quant_description[shard_prefix +
KeyError: 'model.layers.0.self_attn.q_proj.weight'
Traceback (most recent call last):
File "/usr/local/python3.10.17/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/python3.10.17/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1495, in <module>
uvloop.run(run_server(args))
File "/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
return loop.run_until_complete(wrapper())
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/python3.10.17/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
return await main
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1431, in run_server
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 1451, in run_server_worker
async with build_async_engine_client(args, client_config) as engine_client:
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 158, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 199, in __aenter__
return await anext(self.gen)
File "/vllm-workspace/vllm/vllm/entrypoints/openai/api_server.py", line 194, in build_async_engine_client_from_engine_args
async_llm = AsyncLLM.from_vllm_config(
File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 162, in from_vllm_config
return cls(
File "/vllm-workspace/vllm/vllm/v1/engine/async_llm.py", line 124, in __init__
self.engine_core = EngineCoreClient.make_async_mp_client(
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 96, in make_async_mp_client
return AsyncMPClient(*client_args)
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 666, in __init__
super().__init__(
File "/vllm-workspace/vllm/vllm/v1/engine/core_client.py", line 403, in __init__
with launch_core_engines(vllm_config, executor_class,
File "/usr/local/python3.10.17/lib/python3.10/contextlib.py", line 142, in __exit__
next(self.gen)
File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 434, in launch_core_engines
wait_for_engine_startup(
File "/vllm-workspace/vllm/vllm/v1/engine/utils.py", line 484, in wait_for_engine_startup
raise RuntimeError("Engine core initialization failed. "
RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
[ERROR] 2025-07-25-07:11:52 (PID:1889, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception
[root@e9a74ce1729c mas]# python -m vllm.entrypoints.openai.api_server --model /models/z50051264/summary/Qwen2.5-7B-nf4/ --max-num-seqs=256 --max-model-len=4096 --max-num-batched-tokens=4096 --tensor-parallel-size=1 --block-size=128 --host=0.0.0.0 --port=8080 --gpu-memory-utilization=0.9 --trust-remote-code --served-model-name=zzz --quantization bitsandbytes --load-format bitsandbytes
I quantized the model to NF4 with bitsandbytes and got the error above. Please help analyze the root cause (the unquantized version of the model starts up normally).
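From the traceback, the failure happens before any weights are loaded: vllm-ascend's AscendQuantConfig.get_quant_method calls is_layer_skipped_ascend, which indexes self.quant_description with 'model.layers.0.self_attn.q_proj.weight' and raises KeyError. This suggests the Ascend plugin's own quantization config is handling the model, but quant_description has no per-layer entries, because a bitsandbytes checkpoint's quantization config only carries global flags (load_in_4bit, bnb_4bit_quant_type, ...) rather than the per-weight mapping that vllm-ascend expects from an msmodelslim-quantized model. That would also explain why the unquantized model starts fine: without --quantization, this code path is never reached. Below is a minimal sketch of the mismatch; both dictionaries are illustrative assumptions about the respective formats, not copied from either library.

```python
# Hypothetical reconstruction of the lookup that fails in
# vllm_ascend/quantization/quant_config.py (is_layer_skipped_ascend).
# Both dicts below are assumed/illustrative, not real config contents.

# Roughly what vllm-ascend expects: a per-weight quant_description,
# one entry per weight shard, as produced by msmodelslim.
ascend_style_description = {
    "model.layers.0.self_attn.q_proj.weight": "W8A8",
    "model.layers.0.self_attn.k_proj.weight": "W8A8",
    # ... one key per quantized weight
}

# Roughly what a bitsandbytes quantization_config carries instead:
# global flags only, with no per-layer weight keys.
bnb_style_config = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": "bfloat16",
}

shard_prefix = "model.layers.0.self_attn.q_proj"

# The lookup succeeds against the Ascend-style description...
print(ascend_style_description[shard_prefix + ".weight"])  # -> "W8A8"

# ...but raises KeyError against the bitsandbytes-style config,
# matching the KeyError in the log above.
try:
    bnb_style_config[shard_prefix + ".weight"]
except KeyError as exc:
    print("KeyError:", exc)  # KeyError: 'model.layers.0.self_attn.q_proj.weight'
```

If that reading is right, the problem is not the launch command itself but that this vllm-ascend build routes --quantization bitsandbytes into its own Ascend quantization path instead of vLLM's bitsandbytes loader. Checking whether the installed vllm-ascend version supports bitsandbytes, or serving an msmodelslim-quantized checkpoint instead, would be the next step to verify.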