Proc.new, proc, lambda, and "store values of local variables within the scope in which the block was created"

This article looks at the differences between Proc.new, proc, and lambda in Ruby, in particular how different Ruby versions handle argument-count checking. Four examples show how a block refers to outside variables and emphasize that a block's local variables are bound in the scope where the block was created.
Proc.new, proc, and lambda all create a Proc object from a block.
In Ruby 1.8, both proc and lambda check the number of arguments; Proc.new does not.
In Ruby 1.9, only lambda checks the number of arguments; Proc.new and proc do not.
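As a quick sketch of those Ruby 1.9 arity rules (the names strict and loose are mine, for illustration only):

strict = lambda { |a, b| [a, b] }   # lambda: enforces the declared number of arguments
loose  = proc   { |a, b| [a, b] }   # proc (same as Proc.new in 1.9): does not

p loose.call(1)         #=> [1, nil]   missing arguments become nil
p loose.call(1, 2, 3)   #=> [1, 2]     extra arguments are silently dropped
p strict.call(1, 2)     #=> [1, 2]
begin
  strict.call(1)                       # too few arguments for a lambda
rescue ArgumentError => e
  puts e.message        #=> wrong number of arguments (1 for 2)
end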
Local variables referenced inside a block are taken from the scope in which the block was created, not from the scope in which the block is called.
All of the examples below are run under Ruby 1.9.
Example 1:
x = "hello world"
ablock = lambda{ |y| puts(x) }  
# If the line above is changed to ablock = lambda{ puts(x) }, every call to ablock will raise: wrong number of arguments (1 for 0) (ArgumentError)
def aMethod(ablockArg)
  x = "goodbye"
  ablockArg.call(x)
end

puts(x) #=> "hello world"
ablock.call(x) #=> "hello world"
aMethod(ablock) #=> "hello world"
ablock.call(x) #=> "hello world"
puts(x) #=> "hello world"
Every call prints "hello world"; none prints "goodbye". The reason is the rule stated above: local variables referenced inside a block are taken from the scope in which the block was created, not from the scope in which the block is called.
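For contrast, here is a variant (not from the original examples) in which the block prints its parameter y instead of the captured x; the parameter does take whatever value the caller passes in:

x = "hello world"
ablock = lambda { |y| puts(y) }   # prints the argument, not the captured x

def aMethod(ablockArg)
  x = "goodbye"
  ablockArg.call(x)               # passes this method's own local x
end

aMethod(ablock) #=> "goodbye"      the argument came from the caller's scope
ablock.call(x)  #=> "hello world"  here the outer x is passed in explicitly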


Example 2:
x = "hello world"
ablock = proc{ puts(x) }
# In Ruby 1.9, proc does not check the number of arguments, so this call does not raise an error
def aMethod(ablockArg)
  x = "goodbye"
  ablockArg.call(x)
end

puts(x) #=> "hello world"
ablock.call(x) #=> "hello world"
aMethod(ablock) #=> "hello world"
ablock.call(x) #=> "hello world"
puts(x) #=> "hello world"


Example 3:
x = "hello world"
ablock = Proc.new{ puts(x) }
# A Proc object created with Proc.new likewise does not check the number of arguments it is called with
def aMethod(ablockArg)
  x = "goodbye"
  ablockArg.call(x)
end

puts(x) #=> "hello world"
ablock.call(x) #=> "hello world"
aMethod(ablock) #=> "hello world"
ablock.call(x) #=> "hello world"
puts(x) #=> "hello world"
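All three constructors return a Proc object; under Ruby 1.9 you can ask which calling convention a given object follows with Proc#lambda? (a small check added here for illustration):

puts lambda { }.lambda?    #=> true    lambda rules: argument count is checked
puts proc { }.lambda?      #=> false   plain block rules: argument count is not checked
puts Proc.new { }.lambda?  #=> false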


Example 4:

Strictly speaking, the code above should be written as follows:
x = "hello world"
ablock = lambda{ || puts(x) } # the empty || declares that no arguments are expected
# or ablock = proc{ || puts(x) }
# or ablock = Proc.new{ || puts(x) }
def aMethod(ablockArg)
  x = "goodbye"
  ablockArg.call()
end

puts(x) #=> "hello world"
ablock.call() #=> "hello world" # called with no arguments, so the arity check passes
aMethod(ablock) #=> "hello world"
ablock.call() #=> "hello world"
puts(x) #=> "hello world"
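Relatedly, Proc#arity reports how many arguments a block declares, which also explains the error mentioned under Example 1 (a quick inspection under Ruby 1.9, added for illustration):

puts lambda { |y| puts(y) }.arity  #=> 1   expects exactly one argument
puts lambda { || puts(x) }.arity   #=> 0   explicitly declares zero arguments
puts lambda { puts(x) }.arity      #=> 0   also zero, so calling it with x raises ArgumentError
puts proc { puts(x) }.arity        #=> 0   zero as well, but a proc ignores extra arguments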

