@@ -117,8 +117,8 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
117
117
preempted_req = self .running .pop ()
118
118
preempted_req .status = RequestStatus .PREEMPTED
119
119
preempted_req .num_computed_tokens = 0
120
- preempted_req .prefill_block_num = 0
121
120
self ._free_blocks (preempted_req )
121
+ preempted_req .prefill_block_num = None
122
122
self .to_be_rescheduled_request_id_set .add (preempted_req .request_id )
123
123
preempted_reqs .append (preempted_req )
124
124
scheduled_reqs .append (self ._prepare_preempt_task (preempted_req ))
@@ -305,6 +305,7 @@ def schedule(self):
305
305
if self .config .cache_config .enable_prefix_caching :
306
306
success = self .get_prefix_cached_blocks (request )
307
307
if not success :
308
+ self ._free_blocks (request )
308
309
break
309
310
310
311
num_new_tokens = self ._get_num_new_tokens (request , token_budget )
@@ -327,23 +328,33 @@ def schedule(self):
327
328
self .stop_flags [allocated_position ] = False
328
329
self .req_dict [request .request_id ] = allocated_position
329
330
else :
331
+ if self .config .cache_config .enable_prefix_caching :
332
+ self ._free_blocks (request )
330
333
break
331
334
elif request .status == RequestStatus .PREEMPTED :
332
335
request .need_prefill_tokens = (
333
336
request .num_total_tokens
334
337
) # Before preempted task rescheduled, preempted task has been sent to engine, no more tokens are output, here num_total_tokens should be static and correct
338
+ if self .config .cache_config .enable_prefix_caching :
339
+ success = self .get_prefix_cached_blocks (request )
340
+ if not success :
341
+ self ._free_blocks (request )
342
+ break
335
343
num_new_tokens = self ._get_num_new_tokens (request , token_budget )
336
344
num_new_block = self .get_new_block_nums (request , num_new_tokens )
337
345
# Allocate blocks to prefill
338
346
if self .cache_manager .can_allocate_gpu_blocks (num_new_block ):
339
- request .block_tables .extend (self .cache_manager .allocate_gpu_blocks (num_new_block ))
347
+ if not request .get ("skip_allocate" , False ):
348
+ request .block_tables .extend (self .cache_manager .allocate_gpu_blocks (num_new_block ))
340
349
self .waiting .popleft ()
341
350
self .running .append (request )
342
351
scheduled_reqs .append (self ._prepare_prefill_task (request , num_new_tokens ))
343
352
token_budget -= num_new_tokens
344
353
request .num_computed_tokens += num_new_tokens
345
354
request .status = RequestStatus .RUNNING
346
355
else :
356
+ if self .config .cache_config .enable_prefix_caching :
357
+ self ._free_blocks (request )
347
358
break
348
359
else :
349
360
llm_logger .error ("Unknown request status type" )
@@ -399,7 +410,7 @@ def get_prefix_cached_blocks(self, request: Request):
399
410
main_process_metrics .prefix_cpu_cache_token_num .inc (request .cpu_cache_token_num )
400
411
401
412
if matched_token_num == request .prompt_token_ids_len :
402
- request .num_computed_tokens = matched_token_num - 1
413
+ request .num_computed_tokens = matched_token_num - self . config . cache_config . block_size
403
414
request .skip_allocate = True
404
415
else :
405
416
request .num_computed_tokens = matched_token_num
@@ -417,8 +428,15 @@ def add_request(self, request: Request) -> None:
417
428
def _free_blocks(self, request: Request):
    """Return every GPU block held by *request* to the cache manager.

    With prefix caching disabled, all blocks in the request's block table
    belong solely to this request and are recycled outright.  With prefix
    caching enabled, blocks backing a cached prefix are released through
    the cache manager's tree bookkeeping, and only the tail blocks beyond
    the prefix boundary are recycled directly.
    """
    if not self.config.cache_config.enable_prefix_caching:
        # Simple path: no shared prefix blocks to account for.
        self.cache_manager.recycle_gpu_blocks(request.block_tables)
        request.block_tables = []
        return

    # TODO(chengyanfu): support caching output blocks for prefix caching
    if request.get("prefill_block_num", None) is None:
        # prefill_block_num unset — presumably the request was preempted
        # before its prefill blocks were committed (preemption resets it
        # to None).  Drop this request's share on its leaf node, free the
        # node chain directly, and recycle the blocks past the cached
        # boundary recorded in cache_info.  TODO confirm cache_info[0]
        # is the matched-prefix block count.
        leaf = self.cache_manager.req_leaf_map[request.request_id]
        self.cache_manager.decrease_request_share_count(request.request_id)
        self.cache_manager.free_nodes_directly(leaf)
        self.cache_manager.recycle_gpu_blocks(request.block_tables[request.cache_info[0]:])
    else:
        # Normal path: release prefix-cached block ids asynchronously,
        # then recycle only the blocks allocated after the prefill
        # boundary.
        self.cache_manager.release_block_ids_async(request)
        self.cache_manager.recycle_gpu_blocks(request.block_tables[request.prefill_block_num:])
    request.block_tables = []
0 commit comments