
Commit c397cfb

fix
Signed-off-by: Vadim Gimpelson <[email protected]>
1 parent f32f250 commit c397cfb

File tree

1 file changed: +2 -1 lines changed


vllm/model_executor/models/config.py

Lines changed: 2 additions & 1 deletion

@@ -411,6 +411,7 @@ def lcm(a, b):
             chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
             attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
+            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.mamba_block_size = attn_block_size
         else:
             # Without prefix caching, select minimum valid attention block size
@@ -422,12 +423,12 @@ def lcm(a, b):
             attn_block_size = kernel_block_alignment_size * cdiv(
                 mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
             )
+            attn_block_size = next_power_of_2(attn_block_size)

         # override attention block size if either (a) the
         # user has not set it or (b) the user has set it
         # too small.
         if cache_config.block_size is None or cache_config.block_size < attn_block_size:
-            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "
