Commit 1ca933c

fix
Signed-off-by: Vadim Gimpelson <[email protected]>
1 parent 23b55cb commit 1ca933c

1 file changed: +2 −1 lines changed


vllm/model_executor/models/config.py

Lines changed: 2 additions & 1 deletion
@@ -410,6 +410,7 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             attn_tokens_per_mamba_state = cdiv(mamba_page_size, attn_page_size_1_token)
             chunk_size = lcm(base_chunk_size, kernel_block_alignment_size)
             attn_block_size = chunk_size * cdiv(attn_tokens_per_mamba_state, chunk_size)
+            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.mamba_block_size = attn_block_size
         else:
             # Without prefix caching, select minimum valid attention block size
@@ -421,12 +422,12 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
             attn_block_size = kernel_block_alignment_size * cdiv(
                 mamba_page_size, kernel_block_alignment_size * attn_page_size_1_token
             )
+            attn_block_size = next_power_of_2(attn_block_size)
 
         # override attention block size if either (a) the
         # user has not set it or (b) the user has set it
         # too small.
         if cache_config.block_size is None or cache_config.block_size < attn_block_size:
-            attn_block_size = next_power_of_2(attn_block_size)
             cache_config.block_size = attn_block_size
             logger.info(
                 "Setting attention block size to %d tokens "
