Refs
https://blog.vllm.ai/2024/09/05/perf-update.html
https://blog.vllm.ai/2025/01/27/v1-alpha-release.html
Creating the Scheduler
The Scheduler is a core member of EngineCore. Its most important construction inputs come from the KV cache's cache_config: num_gpu_blocks and num_cpu_blocks.
The Scheduler's core job is to decide which request tokens are computed together in a single step, and to allocate KV cache blocks for them.
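A condensed sketch of this dependency (a hypothetical simplification; the real constructor takes full config objects and may differ across versions):

# Condensed sketch: the block counts from cache_config bound what the
# scheduler's KV cache manager can hand out.
from dataclasses import dataclass

@dataclass
class CacheConfig:
    block_size: int       # tokens per KV cache block
    num_gpu_blocks: int   # block pool in GPU memory
    num_cpu_blocks: int   # block pool in CPU swap space

cache_config = CacheConfig(block_size=16, num_gpu_blocks=1024, num_cpu_blocks=256)
# Scheduler(..., cache_config=cache_config, ...) sizes its kv_cache_manager
# from num_gpu_blocks; allocate_slots fails (returns None) once this pool is
# exhausted, which triggers preemption (see below).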
Scheduler inputs
EngineCore.add_request ultimately calls Scheduler.add_request, which appends the request to the Scheduler's waiting queue:
class EngineCore:
    def add_request(self, request: EngineCoreRequest):
        """Add request to the scheduler."""
        req = Request.from_engine_core_request(request)
        self.scheduler.add_request(req)

class Scheduler:
    def __init__(self, ...):
        # req_id -> Request
        self.requests: Dict[str, Request] = {}
        # Priority queues for requests.
        self.waiting: Deque[Request] = deque()
        self.running: List[Request] = []

    def add_request(self, request: Request) -> None:
        self.waiting.append(request)
        self.requests[request.request_id] = request
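For illustration, two consecutive add_request calls (hypothetical ids 'a' and 'b') leave the scheduler in this state:

# After scheduler.add_request(req_a) followed by scheduler.add_request(req_b):
#   self.waiting  == deque([req_a, req_b])     # FIFO: req_a is scheduled first
#   self.running  == []                        # nothing scheduled yet
#   self.requests == {'a': req_a, 'b': req_b}  # id -> Request map, used later
#                                              # by update_from_output / _free_request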
The engine drives the Scheduler in its compute loop, EngineCore.step:
class EngineCore:
    def step(self) -> EngineCoreOutputs:
        """Schedule, execute, and make output."""
        scheduler_output = self.scheduler.schedule()
        output = self.model_executor.execute_model(scheduler_output)
        engine_core_outputs = self.scheduler.update_from_output(
            scheduler_output, output)
        return engine_core_outputs
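A hypothetical driver loop around step() (the has_unfinished_requests helper and the .outputs field are assumptions here, not guaranteed API):

# Keep stepping while the scheduler still holds waiting or running requests.
while engine_core.scheduler.has_unfinished_requests():
    engine_core_outputs = engine_core.step()
    for out in engine_core_outputs.outputs:   # per-request EngineCoreOutput
        print(out.request_id, out.new_token_ids)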
Scheduler.schedule
The basic flow of schedule is shown in the figure below. Note that add_request has already put new requests on the waiting queue, and the running list is initially empty.
schedule first takes each request from the running list and allocates KV cache blocks for the num_new_tokens it needs to compute (normally 1 in the decode case):
request = self.running[req_index]
num_new_tokens = request.num_tokens - request.num_computed_tokens
new_blocks = self.kv_cache_manager.allocate_slots(request, num_new_tokens)
# If new_blocks is None, preempt the lowest-priority request
# from running back to waiting (see below).
If allocation fails (e.g., the KV cache is exhausted), the scheduler falls back to preemption: it pops the request at the tail of the running list into preempted_reqs and pushes it onto the front of the waiting queue, as the sketch below shows.
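A condensed sketch of that preemption loop (simplified from the V1 scheduler; exact details vary across versions):

while True:
    new_blocks = self.kv_cache_manager.allocate_slots(request, num_new_tokens)
    if new_blocks is not None:
        break  # allocation succeeded; the request stays running
    # Preempt the lowest-priority request: the tail of the running list.
    preempted_req = self.running.pop()
    self.kv_cache_manager.free(preempted_req)  # return its KV cache blocks
    preempted_req.num_computed_tokens = 0      # it will be recomputed later
    self.waiting.appendleft(preempted_req)     # resume ahead of new arrivals
    preempted_reqs.append(preempted_req)
    if preempted_req == request:
        break  # the request preempted itself; give up for this step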
It then handles each request in the waiting queue, allocating KV cache blocks for its num_new_tokens (the prompt length minus the prefix-cache hit length):
request = self.waiting[0]
# Get already-cached tokens (prefix cache hits).
computed_blocks, num_computed_tokens = self.kv_cache_manager.get_computed_blocks(request)
num_new_tokens = request.num_tokens - num_computed_tokens
new_blocks = self.kv_cache_manager.allocate_slots(request, num_new_tokens, computed_blocks)
# Move the request from waiting to running.
self.waiting.popleft()
self.running.append(request)
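A small worked example of this accounting (hypothetical numbers, assuming block_size=16):

# A 20-token prompt whose first 16 tokens hit the prefix cache
# (exactly one full block of block_size=16):
num_tokens          = 20   # request.num_tokens
num_computed_tokens = 16   # returned by get_computed_blocks
num_new_tokens = num_tokens - num_computed_tokens   # -> 4 tokens to prefill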
These two sources split the requests in schedule's output into two kinds:
scheduled_new_reqs: List[NewRequestData], taken from the waiting queue.
scheduled_cached_reqs: List[CachedRequestData], taken from the running list; a waiting request becomes running after its first scheduled computation.
Sample scheduler.schedule() outputs from three consecutive steps:
SchedulerOutput(
scheduled_new_reqs=[
NewRequestData(req_id='0', prompt_token_ids=[9707, 11, 847, 829, 374], prompt=None, mm_inputs=[], mm_hashes=[], mm_positions=[],
sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.8, top_p=0.95, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None),
block_ids=[0, 1, 2, 3, 4], num_computed_tokens=0)],
scheduled_cached_reqs=[],
num_scheduled_tokens={'0': 5},
total_num_scheduled_tokens=5, scheduled_encoder_inputs={}, num_common_prefix_blocks=5, finished_req_ids=set(), free_encoder_input_ids=[])
SchedulerOutput(
scheduled_new_reqs=[
NewRequestData(req_id='1', prompt_token_ids=[785, 4767, 315, 279, 3639, 4180, 374], prompt=None, mm_inputs=[], mm_hashes=[], mm_positions=[],
sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.8, top_p=0.95, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None),
block_ids=[5, 6, 7, 8, 9], num_computed_tokens=0),
NewRequestData(req_id='2', prompt_token_ids=[7985, 264, 32794, 911, 5616, 25], prompt=None, mm_inputs=[], mm_hashes=[], mm_positions=[],
sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.8, top_p=0.95, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None),
block_ids=[10, 11, 12, 13, 14], num_computed_tokens=0),
NewRequestData(req_id='3', prompt_token_ids=[15191, 2765, 279, 1879, 4013, 304, 220, 17, 15, 17, 15, 30], prompt=None, mm_inputs=[], mm_hashes=[], mm_positions=[],
sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=0.8, top_p=0.95, top_k=-1, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, guided_decoding=None),
block_ids=[15, 16, 17, 18, 19], num_computed_tokens=0)],
scheduled_cached_reqs=[
CachedRequestData(req_id='0', resumed_from_preemption=False, new_block_ids=[], num_computed_tokens=5)],
num_scheduled_tokens={'0': 1, '1': 7, '2': 6, '3': 12},
total_num_scheduled_tokens=26, scheduled_encoder_inputs={}, num_common_prefix_blocks=0, finished_req_ids=set(), free_encoder_input_ids=[])
SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[
CachedRequestData(req_id='0', resumed_from_preemption=False, new_block_ids=[], num_computed_tokens=6),
CachedRequestData(req_id='1', resumed_from_preemption=False, new_block_ids=[], num_computed_tokens=7),
CachedRequestData(req_id='2', resumed_from_preemption=False, new_block_ids=[], num_computed_tokens=6),
CachedRequestData(req_id='3', resumed_from_preemption=False, new_block_ids=[], num_computed_tokens=12)],
num_scheduled_tokens={'0': 1, '1': 1, '2': 1, '3': 1},
total_num_scheduled_tokens=4, scheduled_encoder_inputs={}, num_common_prefix_blocks=0, finished_req_ids=set(), free_encoder_input_ids=[])
scheduler.schedule does not perform batch concatenation or other such details; it only decides which request tokens are scheduled together for one computation step.
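The samples above make this concrete: the second SchedulerOutput schedules one decode token for request '0' together with three full prefills in a single step:

num_scheduled_tokens = {'0': 1, '1': 7, '2': 6, '3': 12}  # 1 decode + 3 prefills
total_num_scheduled_tokens = sum(num_scheduled_tokens.values())  # -> 26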
For how block_ids are allocated (block_ids in scheduled_new_reqs, new_block_ids in scheduled_cached_reqs), see:
VLLM V1 part 4 - KV cache block management
scheduler.update_from_output
update_from_output first fetches each request's newly sampled tokens and the number of tokens that were scheduled:
sampled_token_ids = model_runner_output.sampled_token_ids
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
Then, for each request in the running list, it advances num_computed_tokens and appends the new token to the request, which also updates num_tokens:
request.num_computed_tokens += num_tokens_scheduled
request.append_output_token_ids(token_id)
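Tracing request '0' through the sample outputs above shows this bookkeeping:

# Step 1 (prefill): 5 prompt tokens scheduled, so afterwards
#   num_computed_tokens = 0 + 5 = 5, and appending the sampled token
#   makes num_tokens = 6 (matches num_computed_tokens=5 in the second
#   SchedulerOutput's CachedRequestData).
# Step 2 (decode): 1 token scheduled, so num_computed_tokens = 5 + 1 = 6
#   (matches num_computed_tokens=6 in the third SchedulerOutput).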
It then checks whether the request has stopped; if so, the request is freed:
stopped = self._check_stop(request)
if stopped:
    self._free_request(request)

def _free_request(self, request: Request) -> None:
    assert request.is_finished()
    self.kv_cache_manager.free(request)
    self.encoder_cache_manager.free(request)
    self._cached_reqs_data.pop(request.request_id, None)
    del self.requests[request.request_id]
    self.finished_req_ids.add(request.request_id)
Finally, the new token ids are wrapped into EngineCoreOutput objects and returned:
output = EngineCoreOutput(
    request_id=req_id,
    new_token_ids=request.output_token_ids[-num_new_tokens:],
    finished=request.is_finished(),
    finish_reason=request.get_finished_reason(),
    stop_reason=request.stop_reason)
outputs.append(output)