Merged

Commits (37)
- 07c667c refactor SchedulerSequence (grimoire, Aug 26, 2025)
- af01586 block sparse attn (grimoire, Aug 26, 2025)
- e6e440d Merge branch 'refactor-seqs' into support-SDAR (grimoire, Aug 27, 2025)
- 4301864 Merge branch 'block-sparse-attn' into support-SDAR (grimoire, Aug 27, 2025)
- e328c5d support SDAR (grimoire, Sep 1, 2025)
- 63efa34 Merge branch 'main' into support-SDAR (grimoire, Sep 1, 2025)
- 48a0137 fix max_new_tokens; update profiler (grimoire, Sep 1, 2025)
- 6e8f4c5 add args (grimoire, Sep 1, 2025)
- 42f4582 fix multiround stop words (grimoire, Sep 1, 2025)
- 9a68f1a fix sampling step (grimoire, Sep 2, 2025)
- 0fa2e7e optimize position_ids (grimoire, Sep 2, 2025)
- 85255d2 fix long context (grimoire, Sep 2, 2025)
- b65afc5 fix vlm (grimoire, Sep 2, 2025)
- da2f403 fix stopping (grimoire, Sep 2, 2025)
- e6b5bdd move args into logitsprocessor (grimoire, Sep 2, 2025)
- 2b0e607 rename (grimoire, Sep 3, 2025)
- f7c7cd8 Merge branch 'main' into support-SDAR (grimoire, Sep 3, 2025)
- a660a43 fix pd (grimoire, Sep 3, 2025)
- b23d962 rename (grimoire, Sep 3, 2025)
- 34e41aa strategy + abstract factory (grimoire, Sep 5, 2025)
- de49bb5 update seqs (grimoire, Sep 5, 2025)
- 3890cfe add moe support (grimoire, Sep 8, 2025)
- c1e4cde bind block length (grimoire, Sep 8, 2025)
- d9d688c solve conflict (grimoire, Sep 11, 2025)
- 26f4c2d fix num loops (grimoire, Sep 12, 2025)
- 11674bf enum unmasking type (grimoire, Sep 15, 2025)
- 8fce74a typo fixing (grimoire, Sep 15, 2025)
- 94c3013 warning (grimoire, Sep 15, 2025)
- c74b535 fix metric (grimoire, Sep 16, 2025)
- bbd1489 limit batch size (grimoire, Sep 16, 2025)
- 11d3c2e merge main (grimoire, Sep 17, 2025)
- cc67ff6 merge main (grimoire, Sep 18, 2025)
- e8771be rename field; comment unmasking strategy (grimoire, Sep 18, 2025)
- 59c7c62 suppress warning (grimoire, Sep 18, 2025)
- c0165df solve conflict (grimoire, Sep 18, 2025)
- 1e47c31 colored vis (grimoire, Sep 18, 2025)
- ee71d91 fix dummy (grimoire, Sep 18, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -150,6 +150,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
1 change: 1 addition & 0 deletions README_ja.md
@@ -137,6 +137,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -151,6 +151,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
8 changes: 8 additions & 0 deletions benchmark/profile_throughput.py
@@ -307,6 +307,10 @@ def parse_args():
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -363,6 +367,10 @@ def main():
quant_policy=args.quant_policy,
dtype=args.dtype,
distributed_executor_backend=args.distributed_executor_backend,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)

if args.use_uvloop:
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -120,6 +120,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -120,6 +120,7 @@
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
@@ -55,6 +55,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -92,6 +92,10 @@ def add_parser_api_server():
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.disable_vision_encoder(pt_group)
ArgumentHelper.logprobs_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
@@ -219,6 +223,10 @@ def api_server(args):
hf_overrides=args.hf_overrides,
disable_vision_encoder=args.disable_vision_encoder,
logprobs_mode=args.logprobs_mode,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)
else:
from lmdeploy.messages import TurbomindEngineConfig
30 changes: 30 additions & 0 deletions lmdeploy/cli/utils.py
@@ -624,6 +624,36 @@ def logprobs_mode(parser):
choices=[None, 'raw_logits', 'raw_logprobs'],
help='The mode of logprobs.')

@staticmethod
def dllm_block_length(parser):
"""dllm_block_length for dllm."""
return parser.add_argument('--dllm-block-length', type=int, default=None, help='Block length for dllm')

@staticmethod
def dllm_unmasking_strategy(parser):
"""Dllm unmasking strategy."""
return parser.add_argument('--dllm-unmasking-strategy',
type=str,
default='low_confidence_dynamic',
choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'],
help='The unmasking strategy for dllm.')

@staticmethod
def dllm_denoising_steps(parser):
"""Dllm denoising steps."""
return parser.add_argument('--dllm-denoising-steps',
type=int,
default=None,
help='The number of denoising steps for dllm.')

@staticmethod
def dllm_confidence_threshold(parser):
"""Dllm confidence threshold."""
return parser.add_argument('--dllm-confidence-threshold',
type=float,
default=0.85,
help='The confidence threshold for dllm.')


# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
class FlexibleArgumentParser(argparse.ArgumentParser):
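Taken together, the four helpers above expose the knobs of the block-diffusion (dllm) decoding loop on every PyTorch-engine CLI. As a rough illustration of what they control, here is a standalone sketch of how a confidence-threshold unmasking step could look; the function name and the exact acceptance rules are assumptions for illustration, not the code this PR adds to the engine.

```python
# Illustrative sketch only: what the dllm unmasking options roughly control.
# `pick_unmask_positions` is a hypothetical helper, not part of lmdeploy.
import torch


def pick_unmask_positions(confidences: torch.Tensor,
                          strategy: str = 'low_confidence_dynamic',
                          confidence_threshold: float = 0.85,
                          tokens_per_step: int = 1) -> torch.Tensor:
    """Choose which still-masked positions of one diffusion block to reveal.

    `confidences` holds the max softmax probability per position, with
    already-revealed positions set to -inf so they are never picked again.
    """
    if strategy == 'low_confidence_dynamic':
        # Reveal every position whose confidence clears the threshold,
        # falling back to the single most confident one so the loop advances.
        picked = (confidences >= confidence_threshold).nonzero(as_tuple=True)[0]
        return picked if picked.numel() > 0 else confidences.argmax().unsqueeze(0)
    if strategy == 'low_confidence_static':
        # Reveal a fixed number of the most confident positions per step
        # (roughly block_length / dllm_denoising_steps of them).
        return confidences.topk(tokens_per_step).indices
    if strategy == 'sequential':
        # Reveal positions left to right, ignoring confidence entirely.
        return torch.arange(tokens_per_step)
    raise ValueError(f'unknown unmasking strategy: {strategy}')


# Example: an 8-token block where three positions are already confident enough.
conf = torch.tensor([0.99, 0.40, 0.91, 0.20, 0.88, 0.10, 0.30, 0.05])
print(pick_unmask_positions(conf, 'low_confidence_dynamic', 0.85))  # tensor([0, 2, 4])
```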
12 changes: 12 additions & 0 deletions lmdeploy/messages.py
@@ -335,6 +335,12 @@ class PytorchEngineConfig:
disable_vision_encoder (bool): Whether to disable loading vision
encoder. Default to False.
logprobs_mode (str): The mode of logprob, options: ['raw_logits', 'raw_logprobs']
dllm_block_length (int): Block size of block diffusion model.
dllm_unmasking_strategy (str): Dllm unmasking strategy, options:
['low_confidence_dynamic', 'low_confidence_static', 'sequential'].
dllm_denoising_steps (int): Dllm denoising steps.
dllm_confidence_threshold (float): dllm unmasking threshold for
dynamic unmasking.
"""
dtype: str = 'auto'
tp: int = 1
@@ -370,6 +376,12 @@ class PytorchEngineConfig:
disable_vision_encoder: bool = False
logprobs_mode: str = None

# dllm
dllm_block_length: int = None
dllm_unmasking_strategy: str = 'low_confidence_dynamic'
dllm_denoising_steps: int = None
dllm_confidence_threshold: float = 0.85

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

5 changes: 4 additions & 1 deletion lmdeploy/metrics/stats.py
@@ -198,7 +198,10 @@ def update_from_output(self, outputs: EngineOutput, req_state: RequestState):
outputs (EngineOutput): The output from the engine containing information about the current iteration.
req_state (RequestState): The state of the request, including timestamps and token counts.
"""
self.new_generation_tokens = outputs.num_token - req_state.generation_tokens
new_generation_tokens = outputs.num_token - req_state.generation_tokens
if new_generation_tokens == 0:
return
self.new_generation_tokens = new_generation_tokens
if req_state.first_token_time == 0:
# It means the first token is generated in this iteration
req_state.first_token_time = outputs.req_metrics.token_timestamp
7 changes: 5 additions & 2 deletions lmdeploy/model.py
@@ -737,7 +737,7 @@ class HFChatTemplate(BaseChatTemplate):

def __init__(self, model_path: str = '', **kwargs):
try:
from transformers import AutoTokenizer, PretrainedConfig
from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.system_start, self.system_end = self._role_instruction('system')
self.user_start, self.user_end = self._role_instruction('user')
@@ -747,7 +747,10 @@ def __init__(self, model_path: str = '', **kwargs):
self.stop_words.append(self.tokenizer.eos_token)
if hasattr(self.tokenizer, 'eot_token') and self.tokenizer.eot_token is not None:
self.stop_words.append(self.tokenizer.eot_token)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
except Exception as e: # noqa
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
self.is_gpt_oss = getattr(cfg, 'architectures', [''])[0] == 'GptOssForCausalLM'
if self.is_gpt_oss:
self.stop_words.append('<|call|>')
1 change: 1 addition & 0 deletions lmdeploy/pytorch/backends/attention.py
@@ -93,6 +93,7 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> AttentionImpl[T]:
"""build."""
36 changes: 32 additions & 4 deletions lmdeploy/pytorch/backends/cuda/attention.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.

import functools
from dataclasses import dataclass
from typing import Literal

@@ -20,8 +21,8 @@
assert torch.ops.flash_attn_3 is not None
use_fa3 = True
except Exception:
logger.warning('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')
logger.debug('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')


@dataclass
@@ -62,6 +63,7 @@ def __init__(
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
block_sparse_size: int = 1,
**kwargs,
):
super().__init__(
@@ -91,6 +93,7 @@ def __init__(
world_size, rank = get_tp_world_rank()
self.alibi_head_offset = self.num_heads * rank
self.alibi_num_heads = self.num_heads * world_size
self.block_sparse_size = block_sparse_size

def forward(
self,
@@ -116,7 +119,7 @@ def forward(
kv_flatten_size = attn_metadata.kv_flatten_size
quant_policy = attn_metadata.quant_policy
if attn_metadata.is_decoding:
max_q_seqlen = 1
max_q_seqlen = self.block_sparse_size
else:
max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2))
fill_max_q_seqlen = max_q_seqlen
@@ -213,11 +216,21 @@ def forward(
logit_softcapping=self.logit_softcapping,
sinks=learnable_sink,
causal=self.causal,
block_sparse_size=self.block_sparse_size,
)

return attn_output


@functools.lru_cache
def use_fa3_warning():
if use_fa3:
return True
logger.warning('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')
return False


class FlashMLAImpl(TritonAttentionImpl):

def __init__(
@@ -252,6 +265,7 @@ def __init__(
from lmdeploy.pytorch.kernels.cuda import flash_mla_fwd
self.flash_mla_fwd = flash_mla_fwd
assert num_kv_heads == 1, 'MLA requires num kv heads equal to 1'
use_fa3_warning()

def forward(
self,
@@ -512,6 +526,14 @@ def forward(
return attn_output


@functools.lru_cache
def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int):
enable = not alibi and not learnable_sink and block_sparse_size == 1
if enable and not use_fa3_warning():
enable = False
return enable


class TritonAttentionBuilder(AttentionBuilder[TritonAttentionMetadata]):
"""Triton attention builder."""

@@ -528,10 +550,13 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> TritonAttentionImpl:
"""build."""
enable_fa3 = _enable_fa3(alibi, learnable_sink, block_sparse_size)
if use_flash_mla is True:
logger.debug('Build FlashMLAImpl Attention')
return FlashMLAImpl(num_heads,
head_size,
scale=scale,
@@ -542,7 +567,8 @@
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
elif use_fa3 and not alibi and not learnable_sink:
elif enable_fa3:
logger.debug('Build FA3Impl Attention')
return FA3Impl(num_heads,
head_size,
scale=scale,
@@ -554,6 +580,7 @@
causal=causal,
**kwargs)
else:
logger.debug('Build TritonAttentionImpl Attention')
return TritonAttentionImpl(num_heads,
head_size,
scale=scale,
@@ -563,4 +590,5 @@
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
block_sparse_size=block_sparse_size,
**kwargs)
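The common thread in these attention changes: with block diffusion, a decoding step carries `block_sparse_size` query tokens per sequence instead of one, and the FA3 fast path is only taken when that size is 1 (and neither ALiBi nor a learnable sink is in use). A small standalone mirror of that gating, for clarity; `fa3_available` stands in for the module-level `use_fa3` flag:

```python
# Standalone mirror of the decode-time gating introduced above; not the module itself.
def enable_fa3(fa3_available: bool, alibi: bool, learnable_sink: bool,
               block_sparse_size: int) -> bool:
    """FA3 is only used for plain causal decoding with one query token per step."""
    return fa3_available and not alibi and not learnable_sink and block_sparse_size == 1


def decode_max_q_seqlen(block_sparse_size: int = 1) -> int:
    """Query length per sequence during decoding; was hard-coded to 1 before this change."""
    return block_sparse_size


assert enable_fa3(True, alibi=False, learnable_sink=False, block_sparse_size=1) is True
assert enable_fa3(True, alibi=False, learnable_sink=False, block_sparse_size=4) is False
assert decode_max_q_seqlen(4) == 4
```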
27 changes: 20 additions & 7 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -9,9 +9,11 @@
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
from lmdeploy.pytorch.models.utils.cudagraph import CudaGraphMeta
from lmdeploy.pytorch.strategies.base import StrategyFactoryBase
from lmdeploy.utils import get_logger

from ..graph_runner import GraphRunner
from .attention import TritonAttentionMetadata

logger = get_logger('lmdeploy')

@@ -146,6 +148,11 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict()
self.has_try_compile_model: bool = False

# strategy factory
build_ctx = model.ctx_mgr.build_ctx
strategy_factory: StrategyFactoryBase = build_ctx.strategy_factory
self.cudagraph_strategy = strategy_factory.build_cudagraph_strategy()

def check_enable_graph(self):
"""Check enable graph."""
if self.backend_config.eager_mode:
@@ -173,18 +180,24 @@ def _get_capture_tokens(self, batch_size: int):
assert False, f'Unsupported batch_size={batch_size}'

def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List,
attn_metadata: Any, inputs_embeds: torch.Tensor, **kwargs):
attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs):
"""Get graph key."""
context = self.ctx_mgr.current_context()
is_decoding = context.is_decoding
num_tokens = input_ids.numel()
batch_size = attn_metadata.q_seqlens.size(0)
meta = self.get_meta()
enable_microbatch = get_step_ctx_manager().current_context().enable_microbatch
if meta.padding_batch_size is None:
new_num_tokens = self._get_capture_tokens(num_tokens)
batch_size = self._get_capture_tokens(batch_size)
else:
new_num_tokens = self._get_capture_tokens(meta.padding_batch_size)
return (new_num_tokens, is_decoding, enable_microbatch)
batch_size = self._get_capture_tokens(meta.padding_batch_size)
return (batch_size, is_decoding, enable_microbatch)

def _get_max_tokens(self, graph_key: tuple):
max_batches = graph_key[0]
is_decoding = graph_key[1]
assert is_decoding
return self.cudagraph_strategy.get_max_tokens(max_batches)

def __call__(self, **kwargs):
"""call."""
@@ -198,10 +211,10 @@ def __call__(self, **kwargs):
return self.model(**kwargs)

graph_key = self.get_graph_key(**kwargs)
max_tokens = graph_key[0]
max_batches = graph_key[0]
is_decoding = graph_key[1]
if graph_key not in self._runner_map:
max_batches = max_tokens if is_decoding else self.max_batches
max_tokens = self._get_max_tokens(graph_key)
runner = CUDASingleGraphRunner(self.model,
max_batches=max_batches,
max_tokens=max_tokens,
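The graph key is now derived from the (padded) batch size rather than from the raw token count, and the token budget of a captured decode graph is delegated to a cudagraph strategy object, presumably so a dllm decode graph can reserve `batch_size * block_length` tokens where autoregressive decoding reserves `batch_size`. A hedged sketch of what such a strategy pair could look like; the class names and the multiplication are assumptions, not the code under `lmdeploy/pytorch/strategies`:

```python
# Illustrative only: a possible shape for the cudagraph strategy split.
from abc import ABC, abstractmethod


class CudaGraphStrategy(ABC):

    @abstractmethod
    def get_max_tokens(self, max_batches: int) -> int:
        """Token capacity to capture for a decode graph covering `max_batches` sequences."""


class ARCudaGraphStrategy(CudaGraphStrategy):
    """Autoregressive decoding: one query token per sequence per step."""

    def get_max_tokens(self, max_batches: int) -> int:
        return max_batches


class DLLMCudaGraphStrategy(CudaGraphStrategy):
    """Block-diffusion decoding: each sequence feeds a whole block per step."""

    def __init__(self, block_length: int):
        self.block_length = block_length

    def get_max_tokens(self, max_batches: int) -> int:
        return max_batches * self.block_length


print(ARCudaGraphStrategy().get_max_tokens(64))     # 64
print(DLLMCudaGraphStrategy(4).get_max_tokens(64))  # 256
```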
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/check_env/transformers.py
@@ -4,7 +4,7 @@
from .base import BaseChecker

MIN_TRANSFORMERS_VERSION = '4.33.0'
MAX_TRANSFORMERS_VERSION = '4.53.3'
MAX_TRANSFORMERS_VERSION = '4.56.1'


class TransformersChecker(BaseChecker):
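The change here only widens the accepted transformers range. As a minimal illustration of the kind of bound check these constants feed (the actual TransformersChecker logic may differ):

```python
# Minimal sketch of a version-range check using the constants above; illustrative only.
from packaging import version

import transformers

MIN_TRANSFORMERS_VERSION = '4.33.0'
MAX_TRANSFORMERS_VERSION = '4.56.1'

installed = version.parse(transformers.__version__)
low, high = version.parse(MIN_TRANSFORMERS_VERSION), version.parse(MAX_TRANSFORMERS_VERSION)
if not (low <= installed <= high):
    print(f'transformers {installed} is outside the tested range '
          f'[{MIN_TRANSFORMERS_VERSION}, {MAX_TRANSFORMERS_VERSION}]; proceed with caution.')
```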