```
[root@80101f8eab9f mas]# python -m vllm.entrypoints.openai.api_server \
> --model /models/z50051264/medusa-1.0-zephyr-7b-beta \
> --engine-args='{"speculative_model": "/models/z50051264/medusa-1.0-zephyr-7b-beta"}' \
> --max-num-seqs=256 \
> --max-model-len=4096 \
> --max-num-batched-tokens=4096 \
> --tensor-parallel-size=1 \
> --block-size=128 \
> --host=0.0.0.0 \
> --port=8080 \
> --gpu-memory-utilization=0.9 \
> --trust-remote-code \
> --served-model-name=zzz
INFO 08-06 07:26:00 [__init__.py:39] Available plugins for group vllm.platform_plugins:
INFO 08-06 07:26:00 [__init__.py:41] - ascend -> vllm_ascend:register
INFO 08-06 07:26:00 [__init__.py:44] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load.
INFO 08-06 07:26:00 [__init__.py:235] Platform plugin ascend is activated
WARNING 08-06 07:26:01 [_custom_ops.py:20] Failed to import from vllm._C with ModuleNotFoundError("No module named 'vllm._C'")
INFO 08-06 07:26:04 [importing.py:63] Triton not installed or not compatible; certain GPU-related functions will not be available.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture DeepSeekMTPModel is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_mtp:CustomDeepSeekMTP.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture Qwen2VLForConditionalGeneration is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen2_vl:AscendQwen2VLForConditionalGeneration.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture Qwen2_5_VLForConditionalGeneration is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen2_5_vl:AscendQwen2_5_VLForConditionalGeneration.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture DeepseekV2ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture DeepseekV3ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM.
WARNING 08-06 07:26:05 [registry.py:413] Model architecture Qwen3MoeForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM.
usage: api_server.py [-h] [--host HOST] [--port PORT] [--uvicorn-log-level {debug,info,warning,error,critical,trace}]
[--disable-uvicorn-access-log] [--allow-credentials] [--allowed-origins ALLOWED_ORIGINS]
[--allowed-methods ALLOWED_METHODS] [--allowed-headers ALLOWED_HEADERS] [--api-key API_KEY]
[--lora-modules LORA_MODULES [LORA_MODULES ...]]
[--prompt-adapters PROMPT_ADAPTERS [PROMPT_ADAPTERS ...]] [--chat-template CHAT_TEMPLATE]
[--chat-template-content-format {auto,string,openai}] [--response-role RESPONSE_ROLE]
[--ssl-keyfile SSL_KEYFILE] [--ssl-certfile SSL_CERTFILE] [--ssl-ca-certs SSL_CA_CERTS]
[--enable-ssl-refresh] [--ssl-cert-reqs SSL_CERT_REQS] [--root-path ROOT_PATH]
[--middleware MIDDLEWARE] [--return-tokens-as-token-ids] [--disable-frontend-multiprocessing]
[--enable-request-id-headers] [--enable-auto-tool-choice]
[--expand-tools-even-if-tool-choice-none]
[--tool-call-parser {deepseek_v3,granite-20b-fc,granite,hermes,internlm,jamba,llama4_pythonic,llama4_json,llama3_json,minimax,mistral,phi4_mini_json,pythonic,xlam} or name registered in --tool-parser-plugin]
[--tool-parser-plugin TOOL_PARSER_PLUGIN] [--log-config-file LOG_CONFIG_FILE] [--model MODEL]
[--task {auto,classify,draft,embed,embedding,generate,reward,score,transcription}]
[--tokenizer TOKENIZER] [--tokenizer-mode {auto,custom,mistral,slow}]
[--trust-remote-code | --no-trust-remote-code]
[--dtype {auto,bfloat16,float,float16,float32,half}] [--seed SEED]
[--hf-config-path HF_CONFIG_PATH] [--allowed-local-media-path ALLOWED_LOCAL_MEDIA_PATH]
[--revision REVISION] [--code-revision CODE_REVISION] [--rope-scaling ROPE_SCALING]
[--rope-theta ROPE_THETA] [--tokenizer-revision TOKENIZER_REVISION]
[--max-model-len MAX_MODEL_LEN] [--quantization QUANTIZATION]
[--enforce-eager | --no-enforce-eager] [--max-seq-len-to-capture MAX_SEQ_LEN_TO_CAPTURE]
[--max-logprobs MAX_LOGPROBS] [--disable-sliding-window | --no-disable-sliding-window]
[--disable-cascade-attn | --no-disable-cascade-attn]
[--skip-tokenizer-init | --no-skip-tokenizer-init]
[--enable-prompt-embeds | --no-enable-prompt-embeds]
[--served-model-name SERVED_MODEL_NAME [SERVED_MODEL_NAME ...]] [--disable-async-output-proc]
[--config-format {auto,hf,mistral}] [--hf-token [HF_TOKEN]] [--hf-overrides HF_OVERRIDES]
[--override-neuron-config OVERRIDE_NEURON_CONFIG]
[--override-pooler-config OVERRIDE_POOLER_CONFIG]
[--logits-processor-pattern LOGITS_PROCESSOR_PATTERN] [--generation-config GENERATION_CONFIG]
[--override-generation-config OVERRIDE_GENERATION_CONFIG]
[--enable-sleep-mode | --no-enable-sleep-mode] [--model-impl {auto,vllm,transformers}]
[--override-attention-dtype OVERRIDE_ATTENTION_DTYPE]
[--load-format {auto,pt,safetensors,npcache,dummy,tensorizer,sharded_state,gguf,bitsandbytes,mistral,runai_streamer,runai_streamer_sharded,fastsafetensors}]
[--download-dir DOWNLOAD_DIR] [--model-loader-extra-config MODEL_LOADER_EXTRA_CONFIG]
[--ignore-patterns IGNORE_PATTERNS [IGNORE_PATTERNS ...]]
[--use-tqdm-on-load | --no-use-tqdm-on-load]
[--qlora-adapter-name-or-path QLORA_ADAPTER_NAME_OR_PATH]
[--pt-load-map-location PT_LOAD_MAP_LOCATION]
[--guided-decoding-backend {auto,guidance,lm-format-enforcer,outlines,xgrammar}]
[--guided-decoding-disable-fallback | --no-guided-decoding-disable-fallback]
[--guided-decoding-disable-any-whitespace | --no-guided-decoding-disable-any-whitespace]
[--guided-decoding-disable-additional-properties | --no-guided-decoding-disable-additional-properties]
[--enable-reasoning | --no-enable-reasoning] [--reasoning-parser {deepseek_r1,granite,qwen3}]
[--distributed-executor-backend {external_launcher,mp,ray,uni,None}]
[--pipeline-parallel-size PIPELINE_PARALLEL_SIZE] [--tensor-parallel-size TENSOR_PARALLEL_SIZE]
[--data-parallel-size DATA_PARALLEL_SIZE] [--data-parallel-rank DATA_PARALLEL_RANK]
[--data-parallel-size-local DATA_PARALLEL_SIZE_LOCAL]
[--data-parallel-address DATA_PARALLEL_ADDRESS]
[--data-parallel-rpc-port DATA_PARALLEL_RPC_PORT]
[--data-parallel-backend DATA_PARALLEL_BACKEND]
[--enable-expert-parallel | --no-enable-expert-parallel] [--enable-eplb | --no-enable-eplb]
[--num-redundant-experts NUM_REDUNDANT_EXPERTS] [--eplb-window-size EPLB_WINDOW_SIZE]
[--eplb-step-interval EPLB_STEP_INTERVAL] [--eplb-log-balancedness | --no-eplb-log-balancedness]
[--max-parallel-loading-workers MAX_PARALLEL_LOADING_WORKERS]
[--ray-workers-use-nsight | --no-ray-workers-use-nsight]
[--disable-custom-all-reduce | --no-disable-custom-all-reduce] [--worker-cls WORKER_CLS]
[--worker-extension-cls WORKER_EXTENSION_CLS]
[--enable-multimodal-encoder-data-parallel | --no-enable-multimodal-encoder-data-parallel]
[--block-size {1,8,16,32,64,128}] [--gpu-memory-utilization GPU_MEMORY_UTILIZATION]
[--swap-space SWAP_SPACE] [--kv-cache-dtype {auto,fp8,fp8_e4m3,fp8_e5m2}]
[--num-gpu-blocks-override NUM_GPU_BLOCKS_OVERRIDE]
[--enable-prefix-caching | --no-enable-prefix-caching]
[--prefix-caching-hash-algo {builtin,sha256}] [--cpu-offload-gb CPU_OFFLOAD_GB]
[--calculate-kv-scales | --no-calculate-kv-scales] [--tokenizer-pool-size TOKENIZER_POOL_SIZE]
[--tokenizer-pool-type TOKENIZER_POOL_TYPE]
[--tokenizer-pool-extra-config TOKENIZER_POOL_EXTRA_CONFIG]
[--limit-mm-per-prompt LIMIT_MM_PER_PROMPT] [--media-io-kwargs MEDIA_IO_KWARGS]
[--mm-processor-kwargs MM_PROCESSOR_KWARGS]
[--disable-mm-preprocessor-cache | --no-disable-mm-preprocessor-cache]
[--enable-lora | --no-enable-lora] [--enable-lora-bias | --no-enable-lora-bias]
[--max-loras MAX_LORAS] [--max-lora-rank MAX_LORA_RANK]
[--lora-extra-vocab-size LORA_EXTRA_VOCAB_SIZE] [--lora-dtype {auto,bfloat16,float16}]
[--long-lora-scaling-factors LONG_LORA_SCALING_FACTORS [LONG_LORA_SCALING_FACTORS ...]]
[--max-cpu-loras MAX_CPU_LORAS] [--fully-sharded-loras | --no-fully-sharded-loras]
[--enable-prompt-adapter | --no-enable-prompt-adapter]
[--max-prompt-adapters MAX_PROMPT_ADAPTERS]
[--max-prompt-adapter-token MAX_PROMPT_ADAPTER_TOKEN]
[--device {auto,cpu,cuda,hpu,neuron,tpu,xpu,None}] [--speculative-config SPECULATIVE_CONFIG]
[--show-hidden-metrics-for-version SHOW_HIDDEN_METRICS_FOR_VERSION]
[--otlp-traces-endpoint OTLP_TRACES_ENDPOINT]
[--collect-detailed-traces {all,model,worker,None} [{all,model,worker,None} ...]]
[--max-num-batched-tokens MAX_NUM_BATCHED_TOKENS] [--max-num-seqs MAX_NUM_SEQS]
[--max-num-partial-prefills MAX_NUM_PARTIAL_PREFILLS]
[--max-long-partial-prefills MAX_LONG_PARTIAL_PREFILLS]
[--cuda-graph-sizes CUDA_GRAPH_SIZES [CUDA_GRAPH_SIZES ...]]
[--long-prefill-token-threshold LONG_PREFILL_TOKEN_THRESHOLD]
[--num-lookahead-slots NUM_LOOKAHEAD_SLOTS] [--scheduler-delay-factor SCHEDULER_DELAY_FACTOR]
[--preemption-mode {recompute,swap,None}] [--num-scheduler-steps NUM_SCHEDULER_STEPS]
[--multi-step-stream-outputs | --no-multi-step-stream-outputs]
[--scheduling-policy {fcfs,priority}] [--enable-chunked-prefill | --no-enable-chunked-prefill]
[--disable-chunked-mm-input | --no-disable-chunked-mm-input] [--scheduler-cls SCHEDULER_CLS]
[--disable-hybrid-kv-cache-manager | --no-disable-hybrid-kv-cache-manager]
[--kv-transfer-config KV_TRANSFER_CONFIG] [--kv-events-config KV_EVENTS_CONFIG]
[--compilation-config COMPILATION_CONFIG] [--additional-config ADDITIONAL_CONFIG]
[--use-v2-block-manager] [--disable-log-stats] [--disable-log-requests]
[--max-log-len MAX_LOG_LEN] [--disable-fastapi-docs] [--enable-prompt-tokens-details]
[--enable-force-include-usage] [--enable-server-load-tracking]
api_server.py: error: unrecognized arguments: --engine-args={"speculative_model": "/models/z50051264/medusa-1.0-zephyr-7b-beta"}
```
请分析上面的报错原因,并给出正确的启动命令。
下面是我用的命令: