Commits
37 commits
07c667c  refactor SchedulerSequence (grimoire, Aug 26, 2025)
af01586  block sparse attn (grimoire, Aug 26, 2025)
e6e440d  Merge branch 'refactor-seqs' into support-SDAR (grimoire, Aug 27, 2025)
4301864  Merge branch 'block-sparse-attn' into support-SDAR (grimoire, Aug 27, 2025)
e328c5d  support SDAR (grimoire, Sep 1, 2025)
63efa34  Merge branch 'main' into support-SDAR (grimoire, Sep 1, 2025)
48a0137  fix max_new_tokens; update profiler (grimoire, Sep 1, 2025)
6e8f4c5  add args (grimoire, Sep 1, 2025)
42f4582  fix multiround stop words (grimoire, Sep 1, 2025)
9a68f1a  fix sampling step (grimoire, Sep 2, 2025)
0fa2e7e  optimize position_ids (grimoire, Sep 2, 2025)
85255d2  fix long context (grimoire, Sep 2, 2025)
b65afc5  fix vlm (grimoire, Sep 2, 2025)
da2f403  fix stopping (grimoire, Sep 2, 2025)
e6b5bdd  move args into logitsprocessor (grimoire, Sep 2, 2025)
2b0e607  rename (grimoire, Sep 3, 2025)
f7c7cd8  Merge branch 'main' into support-SDAR (grimoire, Sep 3, 2025)
a660a43  fix pd (grimoire, Sep 3, 2025)
b23d962  rename (grimoire, Sep 3, 2025)
34e41aa  strategy + abstruct factory (grimoire, Sep 5, 2025)
de49bb5  update seqs (grimoire, Sep 5, 2025)
3890cfe  add moe support (grimoire, Sep 8, 2025)
c1e4cde  bind block length (grimoire, Sep 8, 2025)
d9d688c  solve conflict (grimoire, Sep 11, 2025)
26f4c2d  fix num loops (grimoire, Sep 12, 2025)
11674bf  enum unmasking type (grimoire, Sep 15, 2025)
8fce74a  typo fixing (grimoire, Sep 15, 2025)
94c3013  warning (grimoire, Sep 15, 2025)
c74b535  fix metric (grimoire, Sep 16, 2025)
bbd1489  limit batch size (grimoire, Sep 16, 2025)
11d3c2e  merge main (grimoire, Sep 17, 2025)
cc67ff6  merge main (grimoire, Sep 18, 2025)
e8771be  rename field; comment unmasking strategy (grimoire, Sep 18, 2025)
59c7c62  suppression warning (grimoire, Sep 18, 2025)
c0165df  solve conflict (grimoire, Sep 18, 2025)
1e47c31  colored vis (grimoire, Sep 18, 2025)
ee71d91  fix dummy (grimoire, Sep 18, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -150,6 +150,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
1 change: 1 addition & 0 deletions README_ja.md
@@ -137,6 +137,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -151,6 +151,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
8 changes: 8 additions & 0 deletions benchmark/profile_throughput.py
@@ -307,6 +307,10 @@ def parse_args():
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -363,6 +367,10 @@ def main():
quant_policy=args.quant_policy,
dtype=args.dtype,
distributed_executor_backend=args.distributed_executor_backend,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)

if args.use_uvloop:
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -118,6 +118,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -118,6 +118,7 @@
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
@@ -55,6 +55,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -92,6 +92,10 @@ def add_parser_api_server():
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.disable_vision_encoder(pt_group)
ArgumentHelper.logprobs_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
@@ -219,6 +223,10 @@ def api_server(args):
hf_overrides=args.hf_overrides,
disable_vision_encoder=args.disable_vision_encoder,
logprobs_mode=args.logprobs_mode,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)
else:
from lmdeploy.messages import TurbomindEngineConfig
30 changes: 30 additions & 0 deletions lmdeploy/cli/utils.py
@@ -624,6 +624,36 @@ def logprobs_mode(parser):
choices=[None, 'raw_logits', 'raw_logprobs'],
help='The mode of logprobs.')

@staticmethod
def dllm_block_length(parser):
"""dllm_block_length for dllm."""
return parser.add_argument('--dllm-block-length', type=int, default=None, help='Block length for dllm')

@staticmethod
def dllm_unmasking_strategy(parser):
"""Dllm unmasking strategy."""
return parser.add_argument('--dllm-unmasking-strategy',
type=str,
default='low_confidence_dynamic',
choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'],
help='The unmasking strategy for dllm.')

@staticmethod
def dllm_denoising_steps(parser):
"""Dllm denoising steps."""
return parser.add_argument('--dllm-denoising-steps',
type=int,
default=None,
help='The number of denoising steps for dllm.')

@staticmethod
def dllm_confidence_threshold(parser):
"""Dllm confidence threshold."""
return parser.add_argument('--dllm-confidence-threshold',
type=float,
default=0.85,
help='The confidence threshold for dllm.')


# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
class FlexibleArgumentParser(argparse.ArgumentParser):
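Together, the four `ArgumentHelper` methods above define the CLI surface for the dllm options; the defaults mirror the new `PytorchEngineConfig` fields. Below is a standalone argparse sketch of that surface (not the actual ArgumentHelper plumbing), useful for seeing what `--dllm-block-length 4` and friends resolve to:

```python
import argparse

# Standalone sketch of the dllm flags registered above; names, defaults and
# choices mirror the helpers in this diff.
parser = argparse.ArgumentParser('dllm-args-sketch')
parser.add_argument('--dllm-block-length', type=int, default=None)
parser.add_argument('--dllm-unmasking-strategy',
                    type=str,
                    default='low_confidence_dynamic',
                    choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'])
parser.add_argument('--dllm-denoising-steps', type=int, default=None)
parser.add_argument('--dllm-confidence-threshold', type=float, default=0.85)

args = parser.parse_args(['--dllm-block-length', '4'])
assert args.dllm_block_length == 4
assert args.dllm_unmasking_strategy == 'low_confidence_dynamic'  # default strategy
assert args.dllm_confidence_threshold == 0.85                    # default threshold
```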
12 changes: 12 additions & 0 deletions lmdeploy/messages.py
@@ -335,6 +335,12 @@ class PytorchEngineConfig:
disable_vision_encoder (bool): Whether to disable loading vision
encoder. Default to False.
logprobs_mode (str): The mode of logprob, options: ['raw_logits', 'raw_logprobs']
dllm_block_length (int): Block size of block diffusion model.
dllm_unmasking_strategy (str): Dllm unmasking strategy, options:
['low_confidence_dynamic', 'low_confidence_static', 'sequential'].
dllm_denoising_steps (int): Dllm denoising steps.
dllm_confidence_threshold (float): dllm unmasking threshold for
dynamic unmasking.
"""
dtype: str = 'auto'
tp: int = 1
@@ -370,6 +376,12 @@ class PytorchEngineConfig:
disable_vision_encoder: bool = False
logprobs_mode: str = None

# dllm
dllm_block_length: int = None
dllm_unmasking_strategy: str = 'low_confidence_dynamic'
dllm_denoising_steps: int = None
dllm_confidence_threshold: float = 0.85

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

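For programmatic use the same options are ordinary dataclass fields, so they can be set without going through the CLI. A minimal sketch, assuming the usual `lmdeploy.pipeline` entry point; the model path below is a placeholder for an SDAR checkpoint, not an official id:

```python
# Sketch only: enabling the dllm options on the PyTorch engine.
from lmdeploy import PytorchEngineConfig, pipeline

engine_config = PytorchEngineConfig(
    dllm_block_length=4,                               # block size of the block diffusion model
    dllm_unmasking_strategy='low_confidence_dynamic',  # or 'low_confidence_static' / 'sequential'
    dllm_denoising_steps=None,                         # None keeps the engine default
    dllm_confidence_threshold=0.85,                    # only used by the dynamic strategy
)
pipe = pipeline('path/to/SDAR-checkpoint', backend_config=engine_config)  # placeholder path
print(pipe('Hello, SDAR!'))
```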
1 change: 1 addition & 0 deletions lmdeploy/pytorch/backends/attention.py
@@ -93,6 +93,7 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> AttentionImpl[T]:
"""build."""
10 changes: 8 additions & 2 deletions lmdeploy/pytorch/backends/cuda/attention.py
@@ -62,6 +62,7 @@ def __init__(
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
block_sparse_size: int = 1,
**kwargs,
):
super().__init__(
@@ -91,6 +92,7 @@ def __init__(
world_size, rank = get_tp_world_rank()
self.alibi_head_offset = self.num_heads * rank
self.alibi_num_heads = self.num_heads * world_size
self.block_sparse_size = block_sparse_size

def forward(
self,
@@ -116,7 +118,7 @@ def forward(
kv_flatten_size = attn_metadata.kv_flatten_size
quant_policy = attn_metadata.quant_policy
if attn_metadata.is_decoding:
max_q_seqlen = 1
max_q_seqlen = self.block_sparse_size
else:
max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2))
fill_max_q_seqlen = max_q_seqlen
@@ -213,6 +215,7 @@ def forward(
logit_softcapping=self.logit_softcapping,
sinks=learnable_sink,
causal=self.causal,
block_sparse_size=self.block_sparse_size,
)

return attn_output
@@ -528,9 +531,11 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> TritonAttentionImpl:
"""build."""
enable_fa3 = use_fa3 and not alibi and not learnable_sink and block_sparse_size == 1
if use_flash_mla is True:
return FlashMLAImpl(num_heads,
head_size,
@@ -542,7 +547,7 @@
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
elif use_fa3 and not alibi and not learnable_sink:
elif enable_fa3:
return FA3Impl(num_heads,
head_size,
scale=scale,
Expand All @@ -563,4 +568,5 @@ def build(
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
block_sparse_size=block_sparse_size,
**kwargs)
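The behavioral core of the block-sparse change: during decoding each sequence now contributes `block_sparse_size` query tokens (one dllm block) instead of a single token, and the FA3 path is only taken when no block sparsity is requested. A toy restatement of those two decisions, separate from the real backend classes:

```python
def decode_max_q_seqlen(is_decoding: bool, query_numel: int, num_heads: int, head_dim: int,
                        block_sparse_size: int = 1) -> int:
    """Toy restatement of the max_q_seqlen logic in TritonAttentionImpl.forward."""
    if is_decoding:
        # dllm decodes a whole block per step, so each sequence carries
        # block_sparse_size query tokens (1 for plain autoregressive decoding).
        return block_sparse_size
    return query_numel // (head_dim * num_heads)


def fa3_enabled(use_fa3: bool, alibi: bool, learnable_sink: bool, block_sparse_size: int) -> bool:
    """FA3 is skipped whenever block-sparse (dllm) attention is requested."""
    return use_fa3 and not alibi and not learnable_sink and block_sparse_size == 1


assert decode_max_q_seqlen(True, 0, 32, 128, block_sparse_size=4) == 4
assert not fa3_enabled(True, False, False, block_sparse_size=4)
```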
27 changes: 20 additions & 7 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -9,9 +9,11 @@
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
from lmdeploy.pytorch.models.utils.cudagraph import CudaGraphMeta
from lmdeploy.pytorch.strategies.base import StrategyFactoryBase
from lmdeploy.utils import get_logger

from ..graph_runner import GraphRunner
from .attention import TritonAttentionMetadata

logger = get_logger('lmdeploy')

@@ -146,6 +148,11 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict()
self.has_try_compile_model: bool = False

# strategy factory
build_ctx = model.ctx_mgr.build_ctx
strategy_factory: StrategyFactoryBase = build_ctx.strategy_factory
self.cudagraph_strategy = strategy_factory.build_cudagraph_strategy()

def check_enable_graph(self):
"""Check enable graph."""
if self.backend_config.eager_mode:
@@ -173,18 +180,24 @@ def _get_capture_tokens(self, batch_size: int):
assert False, f'Unsupported batch_size={batch_size}'

def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List,
attn_metadata: Any, inputs_embeds: torch.Tensor, **kwargs):
attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs):
"""Get graph key."""
context = self.ctx_mgr.current_context()
is_decoding = context.is_decoding
num_tokens = input_ids.numel()
batch_size = attn_metadata.q_seqlens.size(0)
meta = self.get_meta()
enable_microbatch = get_step_ctx_manager().current_context().enable_microbatch
if meta.padding_batch_size is None:
new_num_tokens = self._get_capture_tokens(num_tokens)
batch_size = self._get_capture_tokens(batch_size)
else:
new_num_tokens = self._get_capture_tokens(meta.padding_batch_size)
return (new_num_tokens, is_decoding, enable_microbatch)
batch_size = self._get_capture_tokens(meta.padding_batch_size)
return (batch_size, is_decoding, enable_microbatch)

def _get_max_tokens(self, graph_key: tuple):
max_batches = graph_key[0]
is_decoding = graph_key[1]
assert is_decoding
return self.cudagraph_strategy.get_max_tokens(max_batches)

def __call__(self, **kwargs):
"""call."""
@@ -198,10 +211,10 @@ def __call__(self, **kwargs):
return self.model(**kwargs)

graph_key = self.get_graph_key(**kwargs)
max_tokens = graph_key[0]
max_batches = graph_key[0]
is_decoding = graph_key[1]
if graph_key not in self._runner_map:
max_batches = max_tokens if is_decoding else self.max_batches
max_tokens = self._get_max_tokens(graph_key)
runner = CUDASingleGraphRunner(self.model,
max_batches=max_batches,
max_tokens=max_tokens,
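Because a dllm decode step carries a whole block per sequence, a captured graph can no longer assume one token per batch element; the runner therefore asks a cudagraph strategy (built from the strategy factory attached to the build context) for its token budget. The strategy classes themselves are not part of this diff, so the sketch below is only a guess at the contract `get_max_tokens` has to satisfy, assuming the dllm strategy reserves one full block per sequence:

```python
# Hypothetical strategy sketch; the real classes live under lmdeploy/pytorch/strategies
# and are not shown in this diff.
class ARCudagraphStrategy:
    def get_max_tokens(self, max_batches: int) -> int:
        # autoregressive decoding: one query token per sequence
        return max_batches


class DLLMCudagraphStrategy:
    def __init__(self, block_length: int):
        self.block_length = block_length

    def get_max_tokens(self, max_batches: int) -> int:
        # block diffusion decoding: each sequence carries a whole block of
        # dllm_block_length query tokens per step
        return max_batches * self.block_length


assert ARCudagraphStrategy().get_max_tokens(8) == 8
assert DLLMCudagraphStrategy(block_length=4).get_max_tokens(8) == 32
```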
19 changes: 18 additions & 1 deletion lmdeploy/pytorch/config.py
@@ -200,6 +200,9 @@ class ModelConfig:
cogvlm_style: bool = False
custom_module_map: Dict[str, setattr] = None
use_flash_mla: bool = False
model_paradigm: str = 'ar'
dllm_mask_token: int = 0
dllm_block_length: int = None

def get_head_size(self):
"""Get head size."""
@@ -285,6 +288,14 @@ def from_hf_config(cls,
return model_config


@dataclass
class DLLMConfig:
dllm_block_length: int = 1
unmasking_strategy: str = 'low_confidence_dynamic'
denoising_steps: int = None
confidence_threshold: float = 0.85


@dataclass
class MiscConfig:
prefill_interval: int = 16
@@ -294,15 +305,21 @@ class MiscConfig:
hf_overrides: Dict[str, Any] = None
disable_vision_encoder: bool = False
logprobs_mode: str = None
dllm_config: DLLMConfig = None

@classmethod
def from_engine_config(cls, engine_config: PytorchEngineConfig):
"""From engine config."""
dllm_config = DLLMConfig(dllm_block_length=engine_config.dllm_block_length,
unmasking_strategy=engine_config.dllm_unmasking_strategy,
denoising_steps=engine_config.dllm_denoising_steps,
confidence_threshold=engine_config.dllm_confidence_threshold)
misc_config = cls(custom_module_map=engine_config.custom_module_map,
empty_init=engine_config.empty_init,
prefill_interval=engine_config.prefill_interval,
model_format=engine_config.model_format,
hf_overrides=engine_config.hf_overrides,
disable_vision_encoder=engine_config.disable_vision_encoder,
logprobs_mode=engine_config.logprobs_mode)
logprobs_mode=engine_config.logprobs_mode,
dllm_config=dllm_config)
return misc_config
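With this change every engine-side dllm option is funneled into a single DLLMConfig that rides along on MiscConfig, so downstream components read one object instead of four loose fields. A small sketch of the mapping, using only names declared in this diff:

```python
# Sketch: PytorchEngineConfig fields -> DLLMConfig carried on MiscConfig.
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.config import MiscConfig

engine_config = PytorchEngineConfig(dllm_block_length=4, dllm_denoising_steps=2)
misc = MiscConfig.from_engine_config(engine_config)

assert misc.dllm_config.dllm_block_length == 4
assert misc.dllm_config.denoising_steps == 2
assert misc.dllm_config.unmasking_strategy == 'low_confidence_dynamic'  # engine default
assert misc.dllm_config.confidence_threshold == 0.85                    # engine default
```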
19 changes: 19 additions & 0 deletions lmdeploy/pytorch/configurations/sdar.py
@@ -0,0 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .default import AutoModelConfigBuilder, DefaultModelConfigBuilder


class SDARModelConfigBuilder(AutoModelConfigBuilder):

@classmethod
def condition(cls, hf_config):
"""config."""
return hf_config.model_type in ['sdar', 'sdar_moe']

@classmethod
def build(cls, hf_config, model_path: str = None, **kwargs):
"""build."""
cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
cfg.dllm_mask_token = 151669
cfg.model_paradigm = 'dllm'
cfg.dllm_block_length = 4
return cfg
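Builder selection is driven by `condition()`: any Hugging Face config reporting `model_type` 'sdar' or 'sdar_moe' goes through this builder, which switches the model paradigm to 'dllm', pins the mask token id to 151669, and fixes the dllm block length at 4. A minimal check of the dispatch hook with a stand-in config object (not the real transformers config class):

```python
from types import SimpleNamespace

from lmdeploy.pytorch.configurations.sdar import SDARModelConfigBuilder

# SimpleNamespace stands in for a HuggingFace config; only model_type matters here.
assert SDARModelConfigBuilder.condition(SimpleNamespace(model_type='sdar'))
assert SDARModelConfigBuilder.condition(SimpleNamespace(model_type='sdar_moe'))
assert not SDARModelConfigBuilder.condition(SimpleNamespace(model_type='qwen2'))
```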
5 changes: 5 additions & 0 deletions lmdeploy/pytorch/consts.py
@@ -0,0 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
# dllm
DLLM_MASKED = 0
DLLM_UNMASKED = 1
DLLM_CACHED = 2
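These constants enumerate the per-token states of a dllm block: tokens start masked, denoising steps progressively unmask them, and a finished block is marked cached (presumably once its KV entries are committed). A toy illustration of that lifecycle; the real bookkeeping lives in the engine and is not part of this diff:

```python
import numpy as np

from lmdeploy.pytorch.consts import DLLM_CACHED, DLLM_MASKED, DLLM_UNMASKED

block_length = 4
state = np.full(block_length, DLLM_MASKED)   # a freshly scheduled block starts fully masked
state[[0, 2]] = DLLM_UNMASKED                # a denoising step unmasks some positions

if (state != DLLM_MASKED).all():             # once no masked tokens remain...
    state[:] = DLLM_CACHED                   # ...the whole block can be marked as cached
```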