Commits
37 commits
07c667c  refactor SchedulerSequence (grimoire, Aug 26, 2025)
af01586  block sparse attn (grimoire, Aug 26, 2025)
e6e440d  Merge branch 'refactor-seqs' into support-SDAR (grimoire, Aug 27, 2025)
4301864  Merge branch 'block-sparse-attn' into support-SDAR (grimoire, Aug 27, 2025)
e328c5d  support SDAR (grimoire, Sep 1, 2025)
63efa34  Merge branch 'main' into support-SDAR (grimoire, Sep 1, 2025)
48a0137  fix max_new_tokens; update profiler (grimoire, Sep 1, 2025)
6e8f4c5  add args (grimoire, Sep 1, 2025)
42f4582  fix multiround stop words (grimoire, Sep 1, 2025)
9a68f1a  fix sampling step (grimoire, Sep 2, 2025)
0fa2e7e  optimize position_ids (grimoire, Sep 2, 2025)
85255d2  fix long context (grimoire, Sep 2, 2025)
b65afc5  fix vlm (grimoire, Sep 2, 2025)
da2f403  fix stopping (grimoire, Sep 2, 2025)
e6b5bdd  move args into logitsprocessor (grimoire, Sep 2, 2025)
2b0e607  rename (grimoire, Sep 3, 2025)
f7c7cd8  Merge branch 'main' into support-SDAR (grimoire, Sep 3, 2025)
a660a43  fix pd (grimoire, Sep 3, 2025)
b23d962  rename (grimoire, Sep 3, 2025)
34e41aa  strategy + abstruct factory (grimoire, Sep 5, 2025)
de49bb5  update seqs (grimoire, Sep 5, 2025)
3890cfe  add moe support (grimoire, Sep 8, 2025)
c1e4cde  bind block length (grimoire, Sep 8, 2025)
d9d688c  solve conflict (grimoire, Sep 11, 2025)
26f4c2d  fix num loops (grimoire, Sep 12, 2025)
11674bf  enum unmasking type (grimoire, Sep 15, 2025)
8fce74a  typo fixing (grimoire, Sep 15, 2025)
94c3013  warning (grimoire, Sep 15, 2025)
c74b535  fix metric (grimoire, Sep 16, 2025)
bbd1489  limit batch size (grimoire, Sep 16, 2025)
11d3c2e  merge main (grimoire, Sep 17, 2025)
cc67ff6  merge main (grimoire, Sep 18, 2025)
e8771be  rename field; comment unmasking strategy (grimoire, Sep 18, 2025)
59c7c62  suppression warning (grimoire, Sep 18, 2025)
c0165df  solve conflict (grimoire, Sep 18, 2025)
1e47c31  colored vis (grimoire, Sep 18, 2025)
ee71d91  fix dummy (grimoire, Sep 18, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -150,6 +150,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
1 change: 1 addition & 0 deletions README_ja.md
@@ -137,6 +137,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -151,6 +151,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
8 changes: 8 additions & 0 deletions benchmark/profile_throughput.py
@@ -307,6 +307,10 @@ def parse_args():
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -363,6 +367,10 @@ def main():
quant_policy=args.quant_policy,
dtype=args.dtype,
distributed_executor_backend=args.distributed_executor_backend,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)

if args.use_uvloop:
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -118,6 +118,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -118,6 +118,7 @@
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
@@ -55,6 +55,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -92,6 +92,10 @@ def add_parser_api_server():
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.disable_vision_encoder(pt_group)
ArgumentHelper.logprobs_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
@@ -219,6 +223,10 @@ def api_server(args):
hf_overrides=args.hf_overrides,
disable_vision_encoder=args.disable_vision_encoder,
logprobs_mode=args.logprobs_mode,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)
else:
from lmdeploy.messages import TurbomindEngineConfig
30 changes: 30 additions & 0 deletions lmdeploy/cli/utils.py
@@ -624,6 +624,36 @@ def logprobs_mode(parser):
choices=[None, 'raw_logits', 'raw_logprobs'],
help='The mode of logprobs.')

@staticmethod
def dllm_block_length(parser):
"""dllm_block_length for dllm."""
return parser.add_argument('--dllm-block-length', type=int, default=None, help='Block length for dllm')

@staticmethod
def dllm_unmasking_strategy(parser):
"""Dllm unmasking strategy."""
return parser.add_argument('--dllm-unmasking-strategy',
type=str,
default='low_confidence_dynamic',
choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'],
help='The unmasking strategy for dllm.')

@staticmethod
def dllm_denoising_steps(parser):
"""Dllm denoising steps."""
return parser.add_argument('--dllm-denoising-steps',
type=int,
default=None,
help='The number of denoising steps for dllm.')

@staticmethod
def dllm_confidence_threshold(parser):
"""Dllm confidence threshold."""
return parser.add_argument('--dllm-confidence-threshold',
type=float,
default=0.85,
help='The confidence threshold for dllm.')


# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
class FlexibleArgumentParser(argparse.ArgumentParser):
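Together, the four `ArgumentHelper` methods above define the CLI surface for the dllm options; the defaults mirror the new `PytorchEngineConfig` fields. Below is a standalone argparse sketch of that surface (not the actual ArgumentHelper plumbing), useful for seeing what `--dllm-block-length 4` and friends resolve to:

```python
import argparse

# Standalone sketch of the dllm flags registered above; names, defaults and
# choices mirror the helpers in this diff.
parser = argparse.ArgumentParser('dllm-args-sketch')
parser.add_argument('--dllm-block-length', type=int, default=None)
parser.add_argument('--dllm-unmasking-strategy',
                    type=str,
                    default='low_confidence_dynamic',
                    choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'])
parser.add_argument('--dllm-denoising-steps', type=int, default=None)
parser.add_argument('--dllm-confidence-threshold', type=float, default=0.85)

args = parser.parse_args(['--dllm-block-length', '4'])
assert args.dllm_block_length == 4
assert args.dllm_unmasking_strategy == 'low_confidence_dynamic'  # default strategy
assert args.dllm_confidence_threshold == 0.85                    # default threshold
```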
12 changes: 12 additions & 0 deletions lmdeploy/messages.py
@@ -335,6 +335,12 @@ class PytorchEngineConfig:
disable_vision_encoder (bool): Whether to disable loading vision
encoder. Default to False.
logprobs_mode (str): The mode of logprob, options: ['raw_logits', 'raw_logprobs']
dllm_block_length (int): Block size of block diffusion model.
dllm_unmasking_strategy (str): Dllm unmasking strategy, options:
['low_confidence_dynamic', 'low_confidence_static', 'sequential'].
dllm_denoising_steps (int): Dllm denoising steps.
dllm_confidence_threshold (float): dllm unmasking threshold for
dynamic unmasking.
"""
dtype: str = 'auto'
tp: int = 1
@@ -370,6 +376,12 @@ class PytorchEngineConfig:
disable_vision_encoder: bool = False
logprobs_mode: str = None

# dllm
dllm_block_length: int = None
dllm_unmasking_strategy: str = 'low_confidence_dynamic'
dllm_denoising_steps: int = None
dllm_confidence_threshold: float = 0.85

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

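For programmatic use the same options are ordinary dataclass fields, so they can be set without going through the CLI. A minimal sketch, assuming the usual `lmdeploy.pipeline` entry point; the model path below is a placeholder for an SDAR checkpoint, not an official id:

```python
# Sketch only: enabling the dllm options on the PyTorch engine.
from lmdeploy import PytorchEngineConfig, pipeline

engine_config = PytorchEngineConfig(
    dllm_block_length=4,                               # block size of the block diffusion model
    dllm_unmasking_strategy='low_confidence_dynamic',  # or 'low_confidence_static' / 'sequential'
    dllm_denoising_steps=None,                         # None keeps the engine default
    dllm_confidence_threshold=0.85,                    # only used by the dynamic strategy
)
pipe = pipeline('path/to/SDAR-checkpoint', backend_config=engine_config)  # placeholder path
print(pipe('Hello, SDAR!'))
```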
1 change: 1 addition & 0 deletions lmdeploy/pytorch/backends/attention.py
@@ -93,6 +93,7 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> AttentionImpl[T]:
"""build."""
10 changes: 8 additions & 2 deletions lmdeploy/pytorch/backends/cuda/attention.py
@@ -62,6 +62,7 @@ def __init__(
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
block_sparse_size: int = 1,
**kwargs,
):
super().__init__(
@@ -91,6 +92,7 @@ def __init__(
world_size, rank = get_tp_world_rank()
self.alibi_head_offset = self.num_heads * rank
self.alibi_num_heads = self.num_heads * world_size
self.block_sparse_size = block_sparse_size

def forward(
self,
@@ -116,7 +118,7 @@ def forward(
kv_flatten_size = attn_metadata.kv_flatten_size
quant_policy = attn_metadata.quant_policy
if attn_metadata.is_decoding:
max_q_seqlen = 1
max_q_seqlen = self.block_sparse_size
else:
max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2))
fill_max_q_seqlen = max_q_seqlen
@@ -213,6 +215,7 @@ def forward(
logit_softcapping=self.logit_softcapping,
sinks=learnable_sink,
causal=self.causal,
block_sparse_size=self.block_sparse_size,
)

return attn_output
@@ -528,9 +531,11 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> TritonAttentionImpl:
"""build."""
enable_fa3 = use_fa3 and not alibi and not learnable_sink and block_sparse_size == 1
if use_flash_mla is True:
return FlashMLAImpl(num_heads,
head_size,
@@ -542,7 +547,7 @@
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
elif use_fa3 and not alibi and not learnable_sink:
elif enable_fa3:
return FA3Impl(num_heads,
head_size,
scale=scale,
Expand All @@ -563,4 +568,5 @@ def build(
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
block_sparse_size=block_sparse_size,
**kwargs)
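The behavioral core of the block-sparse change: during decoding each sequence now contributes `block_sparse_size` query tokens (one dllm block) instead of a single token, and the FA3 path is only taken when no block sparsity is requested. A toy restatement of those two decisions, separate from the real backend classes:

```python
def decode_max_q_seqlen(is_decoding: bool, query_numel: int, num_heads: int, head_dim: int,
                        block_sparse_size: int = 1) -> int:
    """Toy restatement of the max_q_seqlen logic in TritonAttentionImpl.forward."""
    if is_decoding:
        # dllm decodes a whole block per step, so each sequence carries
        # block_sparse_size query tokens (1 for plain autoregressive decoding).
        return block_sparse_size
    return query_numel // (head_dim * num_heads)


def fa3_enabled(use_fa3: bool, alibi: bool, learnable_sink: bool, block_sparse_size: int) -> bool:
    """FA3 is skipped whenever block-sparse (dllm) attention is requested."""
    return use_fa3 and not alibi and not learnable_sink and block_sparse_size == 1


assert decode_max_q_seqlen(True, 0, 32, 128, block_sparse_size=4) == 4
assert not fa3_enabled(True, False, False, block_sparse_size=4)
```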
27 changes: 20 additions & 7 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -9,9 +9,11 @@
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
from lmdeploy.pytorch.models.utils.cudagraph import CudaGraphMeta
from lmdeploy.pytorch.strategies.base import StrategyFactoryBase
from lmdeploy.utils import get_logger

from ..graph_runner import GraphRunner
from .attention import TritonAttentionMetadata

logger = get_logger('lmdeploy')

@@ -146,6 +148,11 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict()
self.has_try_compile_model: bool = False

# strategy factory
build_ctx = model.ctx_mgr.build_ctx
strategy_factory: StrategyFactoryBase = build_ctx.strategy_factory
self.cudagraph_strategy = strategy_factory.build_cudagraph_strategy()

def check_enable_graph(self):
"""Check enable graph."""
if self.backend_config.eager_mode:
@@ -173,18 +180,24 @@ def _get_capture_tokens(self, batch_size: int):
assert False, f'Unsupported batch_size={batch_size}'

def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List,
attn_metadata: Any, inputs_embeds: torch.Tensor, **kwargs):
attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs):
"""Get graph key."""
context = self.ctx_mgr.current_context()
is_decoding = context.is_decoding
num_tokens = input_ids.numel()
batch_size = attn_metadata.q_seqlens.size(0)
meta = self.get_meta()
enable_microbatch = get_step_ctx_manager().current_context().enable_microbatch
if meta.padding_batch_size is None:
new_num_tokens = self._get_capture_tokens(num_tokens)
batch_size = self._get_capture_tokens(batch_size)
else:
new_num_tokens = self._get_capture_tokens(meta.padding_batch_size)
return (new_num_tokens, is_decoding, enable_microbatch)
batch_size = self._get_capture_tokens(meta.padding_batch_size)
return (batch_size, is_decoding, enable_microbatch)

def _get_max_tokens(self, graph_key: tuple):
max_batches = graph_key[0]
is_decoding = graph_key[1]
assert is_decoding
return self.cudagraph_strategy.get_max_tokens(max_batches)

def __call__(self, **kwargs):
"""call."""
@@ -198,10 +211,10 @@ def __call__(self, **kwargs):
return self.model(**kwargs)

graph_key = self.get_graph_key(**kwargs)
max_tokens = graph_key[0]
max_batches = graph_key[0]
is_decoding = graph_key[1]
if graph_key not in self._runner_map:
max_batches = max_tokens if is_decoding else self.max_batches
max_tokens = self._get_max_tokens(graph_key)
runner = CUDASingleGraphRunner(self.model,
max_batches=max_batches,
max_tokens=max_tokens,
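Because a dllm decode step carries a whole block per sequence, a captured graph can no longer assume one token per batch element; the runner therefore asks a cudagraph strategy (built from the strategy factory attached to the build context) for its token budget. The strategy classes themselves are not part of this diff, so the sketch below is only a guess at the contract `get_max_tokens` has to satisfy, assuming the dllm strategy reserves one full block per sequence:

```python
# Hypothetical strategy sketch; the real classes live under lmdeploy/pytorch/strategies
# and are not shown in this diff.
class ARCudagraphStrategy:
    def get_max_tokens(self, max_batches: int) -> int:
        # autoregressive decoding: one query token per sequence
        return max_batches


class DLLMCudagraphStrategy:
    def __init__(self, block_length: int):
        self.block_length = block_length

    def get_max_tokens(self, max_batches: int) -> int:
        # block diffusion decoding: each sequence carries a whole block of
        # dllm_block_length query tokens per step
        return max_batches * self.block_length


assert ARCudagraphStrategy().get_max_tokens(8) == 8
assert DLLMCudagraphStrategy(block_length=4).get_max_tokens(8) == 32
```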
19 changes: 18 additions & 1 deletion lmdeploy/pytorch/config.py
@@ -200,6 +200,9 @@ class ModelConfig:
cogvlm_style: bool = False
custom_module_map: Dict[str, setattr] = None
use_flash_mla: bool = False
model_paradigm: str = 'ar'
dllm_mask_token: int = 0
dllm_block_length: int = None

def get_head_size(self):
"""Get head size."""
@@ -285,6 +288,14 @@ def from_hf_config(cls,
return model_config


@dataclass
class DLLMConfig:
dllm_block_length: int = 1
unmasking_strategy: str = 'low_confidence_dynamic'
denoising_steps: int = None
confidence_threshold: float = 0.85


@dataclass
class MiscConfig:
prefill_interval: int = 16
@@ -294,15 +305,21 @@ class MiscConfig:
hf_overrides: Dict[str, Any] = None
disable_vision_encoder: bool = False
logprobs_mode: str = None
dllm_config: DLLMConfig = None

@classmethod
def from_engine_config(cls, engine_config: PytorchEngineConfig):
"""From engine config."""
dllm_config = DLLMConfig(dllm_block_length=engine_config.dllm_block_length,
unmasking_strategy=engine_config.dllm_unmasking_strategy,
denoising_steps=engine_config.dllm_denoising_steps,
confidence_threshold=engine_config.dllm_confidence_threshold)
misc_config = cls(custom_module_map=engine_config.custom_module_map,
empty_init=engine_config.empty_init,
prefill_interval=engine_config.prefill_interval,
model_format=engine_config.model_format,
hf_overrides=engine_config.hf_overrides,
disable_vision_encoder=engine_config.disable_vision_encoder,
logprobs_mode=engine_config.logprobs_mode)
logprobs_mode=engine_config.logprobs_mode,
dllm_config=dllm_config)
return misc_config
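With this change every engine-side dllm option is funneled into a single DLLMConfig that rides along on MiscConfig, so downstream components read one object instead of four loose fields. A small sketch of the mapping, using only names declared in this diff:

```python
# Sketch: PytorchEngineConfig fields -> DLLMConfig carried on MiscConfig.
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.config import MiscConfig

engine_config = PytorchEngineConfig(dllm_block_length=4, dllm_denoising_steps=2)
misc = MiscConfig.from_engine_config(engine_config)

assert misc.dllm_config.dllm_block_length == 4
assert misc.dllm_config.denoising_steps == 2
assert misc.dllm_config.unmasking_strategy == 'low_confidence_dynamic'  # engine default
assert misc.dllm_config.confidence_threshold == 0.85                    # engine default
```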
19 changes: 19 additions & 0 deletions lmdeploy/pytorch/configurations/sdar.py
@@ -0,0 +1,19 @@
# Copyright (c) OpenMMLab. All rights reserved.
from .default import AutoModelConfigBuilder, DefaultModelConfigBuilder


class SDARModelConfigBuilder(AutoModelConfigBuilder):

@classmethod
def condition(cls, hf_config):
"""config."""
return hf_config.model_type in ['sdar', 'sdar_moe']

@classmethod
def build(cls, hf_config, model_path: str = None, **kwargs):
"""build."""
cfg = DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
cfg.dllm_mask_token = 151669
cfg.model_paradigm = 'dllm'
cfg.dllm_block_length = 4
return cfg
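Builder selection is driven by `condition()`: any Hugging Face config reporting `model_type` 'sdar' or 'sdar_moe' goes through this builder, which switches the model paradigm to 'dllm', pins the mask token id to 151669, and fixes the dllm block length at 4. A minimal check of the dispatch hook with a stand-in config object (not the real transformers config class):

```python
from types import SimpleNamespace

from lmdeploy.pytorch.configurations.sdar import SDARModelConfigBuilder

# SimpleNamespace stands in for a HuggingFace config; only model_type matters here.
assert SDARModelConfigBuilder.condition(SimpleNamespace(model_type='sdar'))
assert SDARModelConfigBuilder.condition(SimpleNamespace(model_type='sdar_moe'))
assert not SDARModelConfigBuilder.condition(SimpleNamespace(model_type='qwen2'))
```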
5 changes: 5 additions & 0 deletions lmdeploy/pytorch/consts.py
@@ -0,0 +1,5 @@
# Copyright (c) OpenMMLab. All rights reserved.
# dllm
DLLM_MASKED = 0
DLLM_UNMASKED = 1
DLLM_CACHED = 2
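These constants enumerate the per-token states of a dllm block: tokens start masked, denoising steps progressively unmask them, and a finished block is marked cached (presumably once its KV entries are committed). A toy illustration of that lifecycle; the real bookkeeping lives in the engine and is not part of this diff:

```python
import numpy as np

from lmdeploy.pytorch.consts import DLLM_CACHED, DLLM_MASKED, DLLM_UNMASKED

block_length = 4
state = np.full(block_length, DLLM_MASKED)   # a freshly scheduled block starts fully masked
state[[0, 2]] = DLLM_UNMASKED                # a denoising step unmasks some positions

if (state != DLLM_MASKED).all():             # once no masked tokens remain...
    state[:] = DLLM_CACHED                   # ...the whole block can be marked as cached
```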