@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+from typing import Optional, Tuple
+
 from vllm import envs
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
@@ -12,6 +14,12 @@
 
     reshape_and_cache_flash = ops.reshape_and_cache_flash
     from vllm.vllm_flash_attn import flash_attn_varlen_func, get_scheduler_metadata
+    from vllm.vllm_flash_attn.flash_attn_interface import (
+        FA2_AVAILABLE,
+        FA2_UNAVAILABLE_REASON,
+        FA3_AVAILABLE,
+        FA3_UNAVAILABLE_REASON,
+    )
 elif current_platform.is_xpu():
     from vllm._ipex_ops import ipex_ops as ops
 
@@ -20,18 +28,63 @@
     get_scheduler_metadata = ops.get_scheduler_metadata
 
 
+# Functions copied from vllm/vllm_flash_attn/flash_attn_interface.py.
+# Modified to use current_platform.get_device_capability() instead of
+# torch.cuda.get_device_capability(device), because the former does not
+# initialize CUDA.
+def _is_fa2_supported(device=None) -> Tuple[bool, Optional[str]]:
+    if not FA2_AVAILABLE:
+        return False, f"FA2 is unavailable due to: {FA2_UNAVAILABLE_REASON}"
+    device_capability = current_platform.get_device_capability()
+    if device_capability.major < 8:
+        return (
+            False,
+            "FA2 is only supported on devices with compute capability >= 8",
+        )
+    return True, None
+
+
+def _is_fa3_supported(device=None) -> Tuple[bool, Optional[str]]:
+    if not FA3_AVAILABLE:
+        return False, f"FA3 is unavailable due to: {FA3_UNAVAILABLE_REASON}"
+    device_capability = current_platform.get_device_capability()
+    if (
+        device_capability.major < 8
+        or device_capability.major >= 10
+        or device_capability == (8, 6)
+        or device_capability == (8, 9)
+    ):
+        return (
+            False,
+            "FA3 is only supported on devices with compute capability >= 8,"
+            " excluding 8.6 and 8.9 and Blackwell archs (>=10)",
+        )
+    return True, None
+
+
+def is_fa_version_supported(fa_version: int, device=None) -> bool:
+    assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
+    if fa_version == 2:
+        return _is_fa2_supported(device)[0]
+    elif fa_version == 3:
+        return _is_fa3_supported(device)[0]
+
+
+def fa_version_unsupported_reason(fa_version: int, device=None) -> Optional[str]:
+    assert fa_version in [2, 3], f"Unsupported FA version: {fa_version}"
+    if fa_version == 2:
+        return _is_fa2_supported(device)[1]
+    elif fa_version == 3:
+        return _is_fa3_supported(device)[1]
+
+
 def get_flash_attn_version(requires_alibi: bool = False) -> int | None:
     # import here to avoid circular dependencies
     from vllm.platforms import current_platform
 
     if current_platform.is_xpu():
         return 2
     try:
-        from vllm.vllm_flash_attn.flash_attn_interface import (
-            fa_version_unsupported_reason,
-            is_fa_version_supported,
-        )
-
         device_capability = current_platform.get_device_capability()
 
         assert device_capability is not None
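
For reference, a minimal usage sketch of the helpers this diff copies in, showing how a caller would pick the newest supported FlashAttention version and surface the reason when none is available. This is not part of the diff above, and the import path is an assumption; the diff does not show which module these functions live in.

# Hypothetical usage sketch; the module path below is assumed,
# not confirmed by this diff.
from vllm.attention.utils.fa_utils import (
    fa_version_unsupported_reason,
    is_fa_version_supported,
)

# Prefer FA3 where supported, fall back to FA2, otherwise report why
# each version was rejected.
if is_fa_version_supported(3):
    fa_version = 3
elif is_fa_version_supported(2):
    fa_version = 2
else:
    raise RuntimeError(
        "No FlashAttention version is supported on this device: "
        f"FA3: {fa_version_unsupported_reason(3)}; "
        f"FA2: {fa_version_unsupported_reason(2)}"
    )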