
Commit c397cfb

fix
Signed-off-by: Vadim Gimpelson <[email protected]>
1 parent f32f250 commit c397cfb

File tree

1 file changed: +2 -1 lines changed


vllm/model_executor/models/config.py

Lines changed: 2 additions & 1 deletion

@@ -411,6 +411,7 @@ def lcm(a, b):
             chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
             attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
+            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.mamba_block_size = attn_block_size
         else:
             # Without prefix caching, select minimum valid attention block size
@@ -422,12 +423,12 @@ def lcm(a, b):
             attn_block_size = kernel_block_alignment_size * cdiv(
                 mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
             )
+            attn_block_size = next_power_of_2(attn_block_size)

         # override attention block size if either (a) the
         # user has not set it or (b) the user has set it
         # too small.
         if cache_config.block_size is None or cache_config.block_size < attn_block_size:
-            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "
