@@ -1,5 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.

+import functools
 from dataclasses import dataclass
 from typing import Literal

@@ -20,8 +21,8 @@
     assert torch.ops.flash_attn_3 is not None
     use_fa3 = True
 except Exception:
-    logger.warning('For higher performance, please install FlashAttention-3 '
-                   'https://github.com/Dao-AILab/flash-attention')
+    logger.debug('For higher performance, please install FlashAttention-3 '
+                 'https://github.com/Dao-AILab/flash-attention')


 @dataclass
@@ -221,6 +222,15 @@ def forward(
         return attn_output


+@functools.lru_cache
+def use_fa3_warning():
+    if use_fa3:
+        return True
+    logger.warning('For higher performance, please install FlashAttention-3 '
+                   'https://github.com/Dao-AILab/flash-attention')
+    return False
+
+
 class FlashMLAImpl(TritonAttentionImpl):

     def __init__(
@@ -255,6 +265,7 @@ def __init__(
         from lmdeploy.pytorch.kernels.cuda import flash_mla_fwd
         self.flash_mla_fwd = flash_mla_fwd
         assert num_kv_heads == 1, 'MLA requires num kv heads equal to 1'
+        use_fa3_warning()

     def forward(
         self,
@@ -515,6 +526,14 @@ def forward(
         return attn_output


+@functools.lru_cache
+def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int):
+    enable = not alibi and not learnable_sink and block_sparse_size == 1
+    if enable and not use_fa3_warning():
+        enable = False
+    return enable
+
+
 class TritonAttentionBuilder(AttentionBuilder[TritonAttentionMetadata]):
     """Triton attention builder."""

@@ -535,8 +554,9 @@ def build(
         **kwargs,
     ) -> TritonAttentionImpl:
         """build."""
-        enable_fa3 = use_fa3 and not alibi and not learnable_sink and block_sparse_size == 1
+        enable_fa3 = _enable_fa3(alibi, learnable_sink, block_sparse_size)
         if use_flash_mla is True:
+            logger.debug('Build FlashMLAImpl Attention')
             return FlashMLAImpl(num_heads,
                                 head_size,
                                 scale=scale,
@@ -548,6 +568,7 @@ def build(
                                 causal=causal,
                                 **kwargs)
         elif enable_fa3:
+            logger.debug('Build FA3Impl Attention')
             return FA3Impl(num_heads,
                            head_size,
                            scale=scale,
@@ -559,6 +580,7 @@ def build(
                            causal=causal,
                            **kwargs)
         else:
+            logger.debug('Build TritonAttentionImpl Attention')
             return TritonAttentionImpl(num_heads,
                                        head_size,
                                        scale=scale,
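
Side note on the caching pattern used above: decorating a zero-argument function with `functools.lru_cache` turns it into a call-once memo, so the FlashAttention-3 install hint is logged at most once per process instead of on every `build()` call. Below is a minimal standalone sketch of that idea, not code from this commit; the `fa3_available` flag, the logger setup, and the function names are illustrative stand-ins.

```python
import functools
import logging

logger = logging.getLogger('cached_warning_demo')
logging.basicConfig(level=logging.DEBUG)

fa3_available = False  # stand-in for the module-level `use_fa3` flag


@functools.lru_cache
def warn_if_fa3_missing() -> bool:
    """Return True if FA3 is usable; otherwise warn once and return False."""
    if fa3_available:
        return True
    logger.warning('For higher performance, please install FlashAttention-3 '
                   'https://github.com/Dao-AILab/flash-attention')
    return False


@functools.lru_cache
def enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int) -> bool:
    """FA3 is only worth enabling when no alibi / learnable-sink / block-sparse
    features are requested and the kernel is actually available; the result is
    cached per argument combination."""
    wanted = not alibi and not learnable_sink and block_sparse_size == 1
    return wanted and warn_if_fa3_missing()


if __name__ == '__main__':
    # The warning is emitted on the first call only; later calls hit the cache.
    for _ in range(3):
        enable_fa3(alibi=False, learnable_sink=False, block_sparse_size=1)
```

Caching the gate per argument tuple likewise avoids recomputing it (and re-triggering the availability check) for every attention layer that gets built.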