(vllm) wen@DESKTOP-3H5GS3M:~$ CUDA_DEVICE_ORDER=PCI_BUS_ID PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True CUDA_VISIBLE_DEVICES=1 vllm serve /home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit --host 0.0.0.0 --port 8000 --gpu-memory-utilization 0.8 --swap-space 16 --max-num-seqs 16
INFO 09-28 20:05:32 [__init__.py:216] Automatically detected platform cuda.
(APIServer pid=1787797) INFO 09-28 20:05:34 [api_server.py:1896] vLLM API server version 0.10.2
(APIServer pid=1787797) INFO 09-28 20:05:34 [utils.py:328] non-default args: {'model_tag': '/home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit', 'host': '0.0.0.0', 'model': '/home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit', 'gpu_memory_utilization': 0.8, 'swap_space': 16.0, 'max_num_seqs': 16}
(APIServer pid=1787797) INFO 09-28 20:05:39 [__init__.py:742] Resolved architecture: Qwen3NextForCausalLM
(APIServer pid=1787797) `torch_dtype` is deprecated! Use `dtype` instead!
(APIServer pid=1787797) INFO 09-28 20:05:39 [__init__.py:1815] Using max model len 262144
(APIServer pid=1787797) WARNING 09-28 20:05:39 [_ipex_ops.py:16] Import error msg: No module named 'intel_extension_for_pytorch'
(APIServer pid=1787797) INFO 09-28 20:05:39 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=2048.
(APIServer pid=1787797) INFO 09-28 20:05:39 [config.py:310] Hybrid or mamba-based model detected: disabling prefix caching since it is not yet supported.
(APIServer pid=1787797) INFO 09-28 20:05:39 [config.py:321] Hybrid or mamba-based model detected: setting cudagraph mode to FULL_AND_PIECEWISE in order to optimize performance.
(APIServer pid=1787797) INFO 09-28 20:05:39 [config.py:390] Setting attention block size to 544 tokens to ensure that attention page size is >= mamba page size.
(APIServer pid=1787797) INFO 09-28 20:05:39 [config.py:411] Padding mamba page size by 1.49% to ensure that mamba page size and attention page size are exactly equal.
(APIServer pid=1787797) WARNING 09-28 20:05:39 [cache.py:214] Possibly too large swap space. 16.00 GiB out of the 31.31 GiB total CPU memory is allocated for the swap space.
INFO 09-28 20:05:42 [__init__.py:216] Automatically detected platform cuda.
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:44 [core.py:654] Waiting for init message from front-end.
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:44 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='/home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit', speculative_config=None, tokenizer='/home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=262144, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=/home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit, enable_prefix_caching=False, chunked_prefill_enabled=True, use_async_output_proc=True, pooler_config=None, compilation_config={"level":3,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":["vllm.unified_attention","vllm.unified_attention_with_output","vllm.mamba_mixer2","vllm.mamba_mixer","vllm.short_conv","vllm.linear_attention","vllm.plamo2_mamba_mixer","vllm.gdn_attention"],"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":[2,1],"use_cudagraph":true,"cudagraph_num_of_warmups":1,"cudagraph_capture_sizes":[32,24,16,8,4,2,1],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"pass_config":{},"max_capture_size":32,"local_cache_dir":null}
(EngineCore_DP0 pid=1787882) WARNING 09-28 20:05:44 [interface.py:391] Using 'pin_memory=False' as WSL is detected. This may slow down the performance.
[W928 20:05:45.576880075 ProcessGroupNCCL.cpp:981] Warning: TORCH_NCCL_AVOID_RECORD_STREAMS is the default now, this environment variable is thus deprecated. (function operator())
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:45 [parallel_state.py:1165] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
(EngineCore_DP0 pid=1787882) WARNING 09-28 20:05:45 [topk_topp_sampler.py:69] FlashInfer is not available. Falling back to the PyTorch-native implementation of top-p & top-k sampling. For the best performance, please install FlashInfer.
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:45 [gpu_model_runner.py:2338] Starting to load model /home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit...
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:46 [gpu_model_runner.py:2370] Loading model from scratch...
(EngineCore_DP0 pid=1787882) `torch_dtype` is deprecated! Use `dtype` instead!
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:46 [compressed_tensors_moe.py:121] Using CompressedTensorsWNA16MarlinMoEMethod
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:46 [compressed_tensors_wNa16.py:95] Using MarlinLinearKernel for CompressedTensorsWNA16
(EngineCore_DP0 pid=1787882) INFO 09-28 20:05:46 [cuda.py:362] Using Flash Attention backend on V1 engine.
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] EngineCore failed to start.
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] Traceback (most recent call last):
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 505, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 82, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self._init_executor()
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 49, in _init_executor
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.collective_rpc("load_model")
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] answer = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/utils/__init__.py", line 3060, in run_method
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] return func(*args, **kwargs)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 213, in load_model
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.model_runner.load_model(eep_scale_up=eep_scale_up)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2371, in load_model
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.model = model_loader.load_model(
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/base_loader.py", line 45, in load_model
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] model = initialize_model(vllm_config=vllm_config,
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 1079, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.model = Qwen3NextModel(vllm_config=vllm_config,
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 199, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 915, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 642, in make_layers
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] [PPMissingLayer() for _ in range(start_layer)] + [
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 643, in <listcomp>
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 904, in get_layer
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] return Qwen3NextDecoderLayer(
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 782, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.mlp = Qwen3NextSparseMoeBlock(
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 115, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.experts = FusedMoE(num_experts=self.n_routed_experts,
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/layers/fused_moe/layer.py", line 945, in __init__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] self.quant_method.create_weights(layer=self, **moe_quant_params)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 1167, in create_weights
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] w2_weight = torch.nn.Parameter(torch.empty(
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/torch/utils/_device.py", line 103, in __torch_function__
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] return func(*args, **kwargs)
(EngineCore_DP0 pid=1787882) ERROR 09-28 20:05:47 [core.py:718] RuntimeError: CUDA driver error: out of memory
(EngineCore_DP0 pid=1787882) Process EngineCore_DP0:
(EngineCore_DP0 pid=1787882) Traceback (most recent call last):
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
(EngineCore_DP0 pid=1787882) self.run()
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/multiprocessing/process.py", line 108, in run
(EngineCore_DP0 pid=1787882) self._target(*self._args, **self._kwargs)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 722, in run_engine_core
(EngineCore_DP0 pid=1787882) raise e
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 709, in run_engine_core
(EngineCore_DP0 pid=1787882) engine_core = EngineCoreProc(*args, **kwargs)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 505, in __init__
(EngineCore_DP0 pid=1787882) super().__init__(vllm_config, executor_class, log_stats,
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 82, in __init__
(EngineCore_DP0 pid=1787882) self.model_executor = executor_class(vllm_config)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/executor_base.py", line 54, in __init__
(EngineCore_DP0 pid=1787882) self._init_executor()
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 49, in _init_executor
(EngineCore_DP0 pid=1787882) self.collective_rpc("load_model")
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/executor/uniproc_executor.py", line 58, in collective_rpc
(EngineCore_DP0 pid=1787882) answer = run_method(self.driver_worker, method, args, kwargs)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/utils/__init__.py", line 3060, in run_method
(EngineCore_DP0 pid=1787882) return func(*args, **kwargs)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/worker/gpu_worker.py", line 213, in load_model
(EngineCore_DP0 pid=1787882) self.model_runner.load_model(eep_scale_up=eep_scale_up)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2371, in load_model
(EngineCore_DP0 pid=1787882) self.model = model_loader.load_model(
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/base_loader.py", line 45, in load_model
(EngineCore_DP0 pid=1787882) model = initialize_model(vllm_config=vllm_config,
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/model_loader/utils.py", line 64, in initialize_model
(EngineCore_DP0 pid=1787882) return model_class(vllm_config=vllm_config, prefix=prefix)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 1079, in __init__
(EngineCore_DP0 pid=1787882) self.model = Qwen3NextModel(vllm_config=vllm_config,
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/compilation/decorators.py", line 199, in __init__
(EngineCore_DP0 pid=1787882) old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 915, in __init__
(EngineCore_DP0 pid=1787882) self.start_layer, self.end_layer, self.layers = make_layers(
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 642, in make_layers
(EngineCore_DP0 pid=1787882) [PPMissingLayer() for _ in range(start_layer)] + [
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/utils.py", line 643, in <listcomp>
(EngineCore_DP0 pid=1787882) maybe_offload_to_cpu(layer_fn(prefix=f"{prefix}.{idx}"))
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 904, in get_layer
(EngineCore_DP0 pid=1787882) return Qwen3NextDecoderLayer(
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 782, in __init__
(EngineCore_DP0 pid=1787882) self.mlp = Qwen3NextSparseMoeBlock(
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_next.py", line 115, in __init__
(EngineCore_DP0 pid=1787882) self.experts = FusedMoE(num_experts=self.n_routed_experts,
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/layers/fused_moe/layer.py", line 945, in __init__
(EngineCore_DP0 pid=1787882) self.quant_method.create_weights(layer=self, **moe_quant_params)
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py", line 1167, in create_weights
(EngineCore_DP0 pid=1787882) w2_weight = torch.nn.Parameter(torch.empty(
(EngineCore_DP0 pid=1787882) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/torch/utils/_device.py", line 103, in __torch_function__
(EngineCore_DP0 pid=1787882) return func(*args, **kwargs)
(EngineCore_DP0 pid=1787882) RuntimeError: CUDA driver error: out of memory
[rank0]:[W928 20:05:47.656617536 ProcessGroupNCCL.cpp:1538] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())
(APIServer pid=1787797) Traceback (most recent call last):
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/bin/vllm", line 7, in <module>
(APIServer pid=1787797) sys.exit(main())
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/cli/main.py", line 54, in main
(APIServer pid=1787797) args.dispatch_function(args)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/cli/serve.py", line 50, in cmd
(APIServer pid=1787797) uvloop.run(run_server(args))
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/uvloop/__init__.py", line 82, in run
(APIServer pid=1787797) return loop.run_until_complete(wrapper())
(APIServer pid=1787797) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/uvloop/__init__.py", line 61, in wrapper
(APIServer pid=1787797) return await main
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1941, in run_server
(APIServer pid=1787797) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 1961, in run_server_worker
(APIServer pid=1787797) async with build_async_engine_client(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/contextlib.py", line 199, in __aenter__
(APIServer pid=1787797) return await anext(self.gen)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 179, in build_async_engine_client
(APIServer pid=1787797) async with build_async_engine_client_from_engine_args(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/contextlib.py", line 199, in __aenter__
(APIServer pid=1787797) return await anext(self.gen)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/entrypoints/openai/api_server.py", line 221, in build_async_engine_client_from_engine_args
(APIServer pid=1787797) async_llm = AsyncLLM.from_vllm_config(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/utils/__init__.py", line 1589, in inner
(APIServer pid=1787797) return fn(*args, **kwargs)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 212, in from_vllm_config
(APIServer pid=1787797) return cls(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/async_llm.py", line 136, in __init__
(APIServer pid=1787797) self.engine_core = EngineCoreClient.make_async_mp_client(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 102, in make_async_mp_client
(APIServer pid=1787797) return AsyncMPClient(*client_args)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 769, in __init__
(APIServer pid=1787797) super().__init__(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/core_client.py", line 448, in __init__
(APIServer pid=1787797) with launch_core_engines(vllm_config, executor_class,
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/contextlib.py", line 142, in __exit__
(APIServer pid=1787797) next(self.gen)
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 729, in launch_core_engines
(APIServer pid=1787797) wait_for_engine_startup(
(APIServer pid=1787797) File "/home/wen/miniconda3/envs/vllm/lib/python3.10/site-packages/vllm/v1/engine/utils.py", line 782, in wait_for_engine_startup
(APIServer pid=1787797) raise RuntimeError("Engine core initialization failed. "
(APIServer pid=1787797) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}
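
The root cause is the `RuntimeError: CUDA driver error: out of memory` raised while `create_weights` allocates the MoE `w2_weight` tensors. This happens during weight construction, before the KV cache is even profiled, so GPU 1 most likely does not have enough free VRAM to hold the model weights at all: an 80B-parameter model quantized to 4 bits still needs roughly 40+ GB of weight memory on a single device. A first sanity check is to confirm what is actually free on that card (nvidia-smi enumerates GPUs in PCI bus order, matching the CUDA_DEVICE_ORDER=PCI_BUS_ID setting above):

# Show total and free VRAM per GPU; index 1 is the device vLLM was pinned to.
nvidia-smi --query-gpu=index,name,memory.total,memory.free --format=csv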
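If the card genuinely has enough free VRAM and the driver-level OOM is a WSL quirk, the usual mitigation is to shrink everything that is sized at startup. Below is a conservative relaunch sketch; all flag values are illustrative assumptions, not tuned numbers. `--max-model-len` caps the 262144-token default context, and `--cpu-offload-gb` spills part of the weights to host RAM, which is the only realistic single-GPU option when the weights alone exceed VRAM. Note that this box reports only 31.31 GiB of host memory, so the swap space is also reduced from 16 GiB, per the warning in the log above.

# Hedged sketch: numeric values are assumptions to be tuned for the actual card.
# --cpu-offload-gb keeps part of the quantized weights in host RAM and needs
# matching free host memory (tight on a 31.31 GiB machine).
CUDA_DEVICE_ORDER=PCI_BUS_ID PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
CUDA_VISIBLE_DEVICES=1 vllm serve /home/wen/models/Qwen/Qwen3-Next-80B-A3B-Instruct-AWQ-4bit \
  --host 0.0.0.0 --port 8000 \
  --max-model-len 8192 \
  --gpu-memory-utilization 0.90 \
  --max-num-seqs 4 \
  --swap-space 4 \
  --cpu-offload-gb 16

If even offloading cannot fit the weights, the remaining options are a smaller model variant, or tensor parallelism across both GPUs (--tensor-parallel-size 2) instead of pinning the server to a single device.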