3 changes: 1 addition & 2 deletions tests/kernels/attention/test_attention_selector.py
@@ -16,8 +16,7 @@
@pytest.fixture(autouse=True)
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()

pass

# Define MLA and non-MLA backends separately
DEVICE_MLA_BACKENDS = {
2 changes: 1 addition & 1 deletion tests/kernels/attention/test_mha_attn.py
@@ -23,7 +23,7 @@
@pytest.fixture(autouse=True)
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()
pass
# Clear xformers availability cache
import vllm.attention.layer as layer_module

3 changes: 1 addition & 2 deletions tests/kernels/attention/test_rocm_attention_selector.py
@@ -12,8 +12,7 @@
@pytest.fixture(autouse=True)
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()

pass

@pytest.mark.skip(reason="Skipped for now. Should be revisited.")
def test_selector(monkeypatch: pytest.MonkeyPatch):
3 changes: 1 addition & 2 deletions tests/v1/tpu/test_mha_attn.py
@@ -20,8 +20,7 @@
@pytest.fixture(autouse=True)
def clear_cache():
"""Clear lru cache to ensure each test case runs without caching."""
_cached_get_attn_backend.cache_clear()

pass

def ref_attention(
query: torch.Tensor,
95 changes: 48 additions & 47 deletions tests/v1/worker/test_gpu_model_runner.py
@@ -790,55 +790,56 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
cache_dtype="auto",
)
parallel_config = ParallelConfig()
vllm_config = VllmConfig(
model_config=model_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,
)

layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
layer_2 = "model.layers.2.mixer"
layer_3 = "model.layers.3.mixer"
layer_4 = "model.layers.4.mixer"
layer_5 = "model.layers.5.mixer"

with set_current_vllm_config(vllm_config), monkeypatch.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
hf_config = vllm_config.model_config.hf_config
fwd_context = {}
for key in [layer_0, layer_1]:
fwd_context[key] = Attention(
num_heads=model_config.get_num_attention_heads(parallel_config),
num_kv_heads=model_config.get_num_kv_heads(parallel_config),
head_size=model_config.get_head_size(),
scale=1.0,
prefix=key,
)
for key in [layer_2, layer_3, layer_4, layer_5]:
fwd_context[key] = MambaMixer2(
hidden_size=hf_config.hidden_size,
ssm_state_size=hf_config.mamba_d_state,
conv_kernel_size=hf_config.mamba_d_conv,
intermediate_size=hf_config.mamba_expand * hf_config.hidden_size,
use_conv_bias=hf_config.mamba_conv_bias,
use_bias=hf_config.mamba_proj_bias,
n_groups=hf_config.mamba_n_groups,
num_heads=hf_config.mamba_n_heads,
head_dim=hf_config.mamba_d_head,
rms_norm_eps=hf_config.rms_norm_eps,
activation=hf_config.hidden_act,
cache_config=cache_config,
model_config=model_config,
prefix=key,
)
# suppress var not used error
assert fwd_context is not None
vllm_ctx = vllm_config.compilation_config.static_forward_context

with monkeypatch.context() as m:
# Attention backend should be set before creating VllmConfig because
# VllmConfig will determine the kv block size based on the attention backend
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
vllm_config = VllmConfig(
model_config=model_config,
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,
)

layer_0 = "model.layers.0.self_attn.attn"
layer_1 = "model.layers.1.self_attn.attn"
layer_2 = "model.layers.2.mixer"
layer_3 = "model.layers.3.mixer"
layer_4 = "model.layers.4.mixer"
layer_5 = "model.layers.5.mixer"

with set_current_vllm_config(vllm_config):
hf_config = vllm_config.model_config.hf_config
fwd_context = {}
for key in [layer_0, layer_1]:
fwd_context[key] = Attention(
num_heads=model_config.get_num_attention_heads(parallel_config),
num_kv_heads=model_config.get_num_kv_heads(parallel_config),
head_size=model_config.get_head_size(),
scale=1.0,
prefix=key,
)
for key in [layer_2, layer_3, layer_4, layer_5]:
fwd_context[key] = MambaMixer2(
hidden_size=hf_config.hidden_size,
ssm_state_size=hf_config.mamba_d_state,
conv_kernel_size=hf_config.mamba_d_conv,
intermediate_size=hf_config.mamba_expand * hf_config.hidden_size,
use_conv_bias=hf_config.mamba_conv_bias,
use_bias=hf_config.mamba_proj_bias,
n_groups=hf_config.mamba_n_groups,
num_heads=hf_config.mamba_n_heads,
head_dim=hf_config.mamba_d_head,
rms_norm_eps=hf_config.rms_norm_eps,
activation=hf_config.hidden_act,
cache_config=cache_config,
model_config=model_config,
prefix=key,
)
# suppress var not used error
assert fwd_context is not None
vllm_ctx = vllm_config.compilation_config.static_forward_context


runner = GPUModelRunner(vllm_config, DEVICE)
kv_cache_spec = runner.get_kv_cache_spec()
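Note: the comment added in the new version states the invariant behind this reorder: VLLM_ATTENTION_BACKEND must be set before VllmConfig is constructed, because the config now derives the kv-cache block size from the selected backend. A minimal sketch of that ordering, assuming model_config, cache_config, scheduler_config, and parallel_config are built exactly as earlier in this test (this illustrates the ordering only, it is not the full test):

    import pytest
    from vllm.config import VllmConfig

    def test_backend_env_precedes_config(monkeypatch: pytest.MonkeyPatch) -> None:
        with monkeypatch.context() as m:
            # Select the backend first: VllmConfig reads VLLM_ATTENTION_BACKEND
            # while resolving the kv-cache block size, so setting it later is too late.
            m.setenv("VLLM_ATTENTION_BACKEND", "FLASHINFER")
            vllm_config = VllmConfig(
                model_config=model_config,  # assumed built as in the test above
                cache_config=cache_config,
                scheduler_config=scheduler_config,
                parallel_config=parallel_config,
            )
            assert vllm_config.cache_config.block_size is not None
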
3 changes: 0 additions & 3 deletions vllm/attention/selector.py
@@ -5,7 +5,6 @@
from collections.abc import Generator
from contextlib import contextmanager
from dataclasses import dataclass
from functools import cache

import torch

@@ -145,7 +144,6 @@ def get_attn_backend(
)


@cache
def _cached_get_attn_backend(
head_size: int,
dtype: torch.dtype,
@@ -236,4 +234,3 @@ def global_force_attn_backend_context_manager(
finally:
# Revert the original global backend override, if any
global_force_attn_backend(original_value)
_cached_get_attn_backend.cache_clear()
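For context on the fixture changes in the test files above: _cached_get_attn_backend used to be wrapped in functools.cache, so tests had to call cache_clear() between cases; with the decorator removed here, those fixtures have nothing left to clear. A generic sketch of the old pattern (illustrative names, not the selector's real signature):

    from functools import cache

    @cache
    def select_backend(head_size: int, dtype: str) -> str:
        # The expensive resolution runs once per distinct argument tuple.
        print("resolving backend")
        return f"backend-{head_size}-{dtype}"

    select_backend(64, "bf16")    # resolves
    select_backend(64, "bf16")    # served from the cache, no print
    select_backend.cache_clear()  # what the old clear_cache fixtures did
    select_backend(64, "bf16")    # resolves again
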
63 changes: 58 additions & 5 deletions vllm/attention/utils/fa_utils.py
@@ -1,6 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional, Tuple

from vllm import envs
from vllm.logger import init_logger
from vllm.platforms import current_platform
@@ -12,6 +14,12 @@

reshape_and_cache_flash = ops.reshape_and_cache_flash
from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
from vllm.vllm_flash_attn.flash_attn_interface import (
FA2_AVAILABLE,
FA2_UNAVAILABLE_REASON,
FA3_AVAILABLE,
FA3_UNAVAILABLE_REASON,
)
elif current_platform.is_xpu():
from vllm._ipex_ops import ipex_ops as ops

@@ -20,18 +28,63 @@
get_scheduler_metadata = ops.get_scheduler_metadata


# Functions copied from vllm/vllm_flash_attn/flash_attn_interface.py
# Modified to use current_platform.get_device_capability() instead of
# torch.cuda.get_device_capability(device) because current_platform.get_device_capability()
# does not initialize CUDA.

Check failure on line 32 in vllm/attention/utils/fa_utils.py (GitHub Actions / pre-commit): Ruff (E501) vllm/attention/utils/fa_utils.py:32:89: Line too long (91 > 88)
def _is_fa2_supported(device=None) -> Tuple[bool, Optional[str]]:
if not FA2_AVAILABLE:
return False, f"FA2 is unavaible due to: {FA2_UNAVAILABLE_REASON}"
device_capability = current_platform.get_device_capability()
if device_capability.major < 8:
return (
False,
"FA2 is only supported on devices with compute capability >= 8",
)
return True, None


def _is_fa3_supported(device=None) -> Tuple[bool, Optional[str]]:
if not FA3_AVAILABLE:
return False, f"FA3 is unavaible due to: {FA3_UNAVAILABLE_REASON}"
device_capability = current_platform.get_device_capability()
if (
device_capability.major < 8
or device_capability.major >= 10
or device_capability == (8, 6)
or device_capability == (8, 9)
):
return (
False,
"FA3 is only supported on devices with compute capability >= 8"
" excluding 8.6 and 8.9 and Blackwell archs (>=10)",
)
return True, None


Check failure on line 64 in vllm/attention/utils/fa_utils.py (GitHub Actions / pre-commit): Missing return statement [return] (reported 4 times)
def is_fa_version_supported(fa_version: int, device=None) -> bool:
assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
if fa_version == 2:
return _is_fa2_supported(device)[0]
elif fa_version == 3:
return _is_fa3_supported(device)[0]


Check failure on line 72 in vllm/attention/utils/fa_utils.py (GitHub Actions / pre-commit): Missing return statement [return] (reported 4 times)
def fa_version_unsupported_reason(fa_version: int, device=None) -> Optional[str]:
assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
if fa_version == 2:
return _is_fa2_supported(device)[1]
elif fa_version == 3:
return _is_fa3_supported(device)[1]


def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
# import here to avoid circular dependencies
from vllm.platforms import current_platform

if current_platform.is_xpu():
return 2
try:
from vllm.vllm_flash_attn.flash_attn_interface import (
fa_version_unsupported_reason,
is_fa_version_supported,
)

device_capability = current_platform.get_device_capability()

assert device_capability is not None
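The pre-commit failures above point at the if/elif chains: the checker cannot tell that the assert makes fa_version == 3 the only remaining case, so it sees a code path with no return. One way to satisfy it (a sketch, not necessarily how this PR will be fixed) is to make the final branch unconditional:

    def is_fa_version_supported(fa_version: int, device=None) -> bool:
        assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
        if fa_version == 2:
            return _is_fa2_supported(device)[0]
        # After the assert, 3 is the only other possibility, so an unconditional
        # return gives the checker a value on every path.
        return _is_fa3_supported(device)[0]

    def fa_version_unsupported_reason(fa_version: int, device=None) -> Optional[str]:
        assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
        if fa_version == 2:
            return _is_fa2_supported(device)[1]
        return _is_fa3_supported(device)[1]
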
4 changes: 4 additions & 0 deletions vllm/config/vllm.py
@@ -880,6 +880,10 @@ def try_verify_and_update_config(self):
if architecture is None:
return

from vllm.model_executor.layers.config import AttentionConfig

AttentionConfig.verify_and_update_config(self)

from vllm.model_executor.models.config import (
MODELS_CONFIG_MAP,
HybridAttentionMambaModelConfig,
57 changes: 57 additions & 0 deletions vllm/model_executor/layers/config.py
@@ -0,0 +1,57 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from math import lcm
from typing import TYPE_CHECKING

from vllm.attention.backends.abstract import MultipleOf
from vllm.attention.selector import get_attn_backend
from vllm.logger import init_logger
from vllm.model_executor.models.config import VerifyAndUpdateConfig

if TYPE_CHECKING:
from vllm.config import VllmConfig

logger = init_logger(__name__)


class AttentionConfig(VerifyAndUpdateConfig):
@classmethod
def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"""
Align cache_config.block_size with attention backend's minimum
supported kernel block size.

Args:
vllm_config: vLLM Config
"""
model_config = vllm_config.model_config
cache_config = vllm_config.cache_config
assert cache_config is not None

backend_cls = get_attn_backend(
head_size=model_config.get_head_size(),
dtype=model_config.dtype,
kv_cache_dtype=cache_config.cache_dtype,
block_size=cache_config.block_size
if cache_config.block_size is not None
else 16,
)

supported_sizes = backend_cls.get_supported_kernel_block_size()
supported_sizes = [
s.base if isinstance(s, MultipleOf) else s for s in supported_sizes
]
min_size = min(supported_sizes)
if cache_config.block_size is None:
new_block_size = min_size
else:
new_block_size = lcm(cache_config.block_size, min_size)
Collaborator: Prefer to raise an error if the user sets a block_size but the block_size is not supported by the attention backend it selects.

if cache_config.block_size is None or new_block_size != cache_config.block_size:
Collaborator: I think we don't need to add info-level logging if block_size is None and is initialized normally.

cache_config.block_size = new_block_size
logger.info(
"Setting attention block size to %d tokens "
"to align with %s attention backend's supported kernel block sizes.",
new_block_size,
backend_cls.get_name(),
)
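One possible shape for the stricter behaviour suggested in the inline comment above: raise when the user explicitly set a block_size the backend cannot serve, and only auto-pick when it was left unset. This is a sketch of the reviewer's idea, reusing the names already defined in this method (min_size, backend_cls, cache_config) and simplifying the support check to a multiple-of test; it is not the code in this PR:

    if cache_config.block_size is None:
        # Nothing requested: quietly pick the smallest supported kernel block size.
        cache_config.block_size = min_size
    elif cache_config.block_size % min_size != 0:
        # The user asked for a size the backend's kernels cannot serve.
        raise ValueError(
            f"block_size={cache_config.block_size} is not supported by the "
            f"{backend_cls.get_name()} attention backend; it must be a "
            f"multiple of {min_size}."
        )
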
6 changes: 5 additions & 1 deletion vllm/model_executor/models/config.py
@@ -356,7 +356,11 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
dtype=kv_cache_dtype,
).page_size_bytes
else:
kernel_block_alignment_size = 16
if cache_config.block_size is not None:
Collaborator: Is this part called before or after AttentionConfig.verify_and_update_config(self)? I think the kernel_block_alignment_size should be resolved from backend_cls.get_supported_kernel_block_size.

kernel_block_alignment_size = cache_config.block_size
else:
kernel_block_alignment_size = 16

if (
current_platform.is_device_capability(100)
and model_config.get_head_size() == 256
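A sketch of the resolution hinted at in the comment above: derive kernel_block_alignment_size from the backend's supported kernel block sizes instead of falling back to a hard-coded 16. Here backend_cls is assumed to be obtained the same way as in vllm/model_executor/layers/config.py; this illustrates the reviewer's idea, not this PR's code:

    from vllm.attention.backends.abstract import MultipleOf

    supported = backend_cls.get_supported_kernel_block_size()
    kernel_block_alignment_size = min(
        s.base if isinstance(s, MultipleOf) else s for s in supported
    )
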
5 changes: 4 additions & 1 deletion vllm/v1/attention/backends/flashinfer.py
@@ -173,7 +173,10 @@ def get_supported_kernel_block_size() -> list[int | MultipleOf]:
# Note: Not sure for all platforms,
# but on Blackwell, only support a page size of
# 16, 32, 64
return [16, 32, 64]
# TODO: 16 is temporarily removed because the TRT-LLM kernel has a bug when using 16.
# See https://github.com/flashinfer-ai/flashinfer/issues/1993
# for more details.
return [32, 64]

Collaborator: If the problem only exists in TRT-LLM, what about removing 16 only for TRT-LLM, like:

    if current_platform.is_device_capability(100):
        return [32, 64]
    else:
        return [16, 32, 64]

Member: Agreed, we should only override on Blackwell.


@classmethod
def validate_head_size(cls, head_size: int) -> None: