Merged

Commits (37)
- 07c667c refactor SchedulerSequence (grimoire, Aug 26, 2025)
- af01586 block sparse attn (grimoire, Aug 26, 2025)
- e6e440d Merge branch 'refactor-seqs' into support-SDAR (grimoire, Aug 27, 2025)
- 4301864 Merge branch 'block-sparse-attn' into support-SDAR (grimoire, Aug 27, 2025)
- e328c5d support SDAR (grimoire, Sep 1, 2025)
- 63efa34 Merge branch 'main' into support-SDAR (grimoire, Sep 1, 2025)
- 48a0137 fix max_new_tokens; update profiler (grimoire, Sep 1, 2025)
- 6e8f4c5 add args (grimoire, Sep 1, 2025)
- 42f4582 fix multiround stop words (grimoire, Sep 1, 2025)
- 9a68f1a fix sampling step (grimoire, Sep 2, 2025)
- 0fa2e7e optimize position_ids (grimoire, Sep 2, 2025)
- 85255d2 fix long context (grimoire, Sep 2, 2025)
- b65afc5 fix vlm (grimoire, Sep 2, 2025)
- da2f403 fix stopping (grimoire, Sep 2, 2025)
- e6b5bdd move args into logitsprocessor (grimoire, Sep 2, 2025)
- 2b0e607 rename (grimoire, Sep 3, 2025)
- f7c7cd8 Merge branch 'main' into support-SDAR (grimoire, Sep 3, 2025)
- a660a43 fix pd (grimoire, Sep 3, 2025)
- b23d962 rename (grimoire, Sep 3, 2025)
- 34e41aa strategy + abstract factory (grimoire, Sep 5, 2025)
- de49bb5 update seqs (grimoire, Sep 5, 2025)
- 3890cfe add moe support (grimoire, Sep 8, 2025)
- c1e4cde bind block length (grimoire, Sep 8, 2025)
- d9d688c solve conflict (grimoire, Sep 11, 2025)
- 26f4c2d fix num loops (grimoire, Sep 12, 2025)
- 11674bf enum unmasking type (grimoire, Sep 15, 2025)
- 8fce74a typo fixing (grimoire, Sep 15, 2025)
- 94c3013 warning (grimoire, Sep 15, 2025)
- c74b535 fix metric (grimoire, Sep 16, 2025)
- bbd1489 limit batch size (grimoire, Sep 16, 2025)
- 11d3c2e merge main (grimoire, Sep 17, 2025)
- cc67ff6 merge main (grimoire, Sep 18, 2025)
- e8771be rename field; comment unmasking strategy (grimoire, Sep 18, 2025)
- 59c7c62 suppress warning (grimoire, Sep 18, 2025)
- c0165df solve conflict (grimoire, Sep 18, 2025)
- 1e47c31 colored vis (grimoire, Sep 18, 2025)
- ee71d91 fix dummy (grimoire, Sep 18, 2025)
1 change: 1 addition & 0 deletions README.md
@@ -150,6 +150,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
1 change: 1 addition & 0 deletions README_ja.md
@@ -137,6 +137,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
</ul>
</td>
<td>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -151,6 +151,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
<li>Phi-3.5-MoE (16x3.8B)</li>
<li>Phi-4-mini (3.8B)</li>
<li>MiniCPM3 (4B)</li>
<li>SDAR (1.7B-30B)</li>
<li>gpt-oss (20B, 120B)</li>
</ul>
</td>
8 changes: 8 additions & 0 deletions benchmark/profile_throughput.py
@@ -307,6 +307,10 @@ def parse_args():
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

tp_act = ArgumentHelper.tp(pt_group)
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -363,6 +367,10 @@ def main():
quant_policy=args.quant_policy,
dtype=args.dtype,
distributed_executor_backend=args.distributed_executor_backend,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)

if args.use_uvloop:
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -120,6 +120,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.
1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -120,6 +120,7 @@
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |

```{note}
* [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16
1 change: 1 addition & 0 deletions lmdeploy/cli/cli.py
@@ -55,6 +55,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -92,6 +92,10 @@ def add_parser_api_server():
ArgumentHelper.eager_mode(pt_group)
ArgumentHelper.disable_vision_encoder(pt_group)
ArgumentHelper.logprobs_mode(pt_group)
ArgumentHelper.dllm_block_length(pt_group)
ArgumentHelper.dllm_unmasking_strategy(pt_group)
ArgumentHelper.dllm_denoising_steps(pt_group)
ArgumentHelper.dllm_confidence_threshold(pt_group)

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
@@ -219,6 +223,10 @@ def api_server(args):
hf_overrides=args.hf_overrides,
disable_vision_encoder=args.disable_vision_encoder,
logprobs_mode=args.logprobs_mode,
dllm_block_length=args.dllm_block_length,
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
dllm_denoising_steps=args.dllm_denoising_steps,
dllm_confidence_threshold=args.dllm_confidence_threshold,
)
else:
from lmdeploy.messages import TurbomindEngineConfig
30 changes: 30 additions & 0 deletions lmdeploy/cli/utils.py
@@ -624,6 +624,36 @@ def logprobs_mode(parser):
choices=[None, 'raw_logits', 'raw_logprobs'],
help='The mode of logprobs.')

@staticmethod
def dllm_block_length(parser):
"""dllm_block_length for dllm."""
return parser.add_argument('--dllm-block-length', type=int, default=None, help='Block length for dllm')

@staticmethod
def dllm_unmasking_strategy(parser):
"""Dllm unmasking strategy."""
return parser.add_argument('--dllm-unmasking-strategy',
type=str,
default='low_confidence_dynamic',
choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'],
help='The unmasking strategy for dllm.')

@staticmethod
def dllm_denoising_steps(parser):
"""Dllm denoising steps."""
return parser.add_argument('--dllm-denoising-steps',
type=int,
default=None,
help='The number of denoising steps for dllm.')

@staticmethod
def dllm_confidence_threshold(parser):
"""Dllm confidence threshold."""
return parser.add_argument('--dllm-confidence-threshold',
type=float,
default=0.85,
help='The confidence threshold for dllm.')


# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
class FlexibleArgumentParser(argparse.ArgumentParser):
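Taken together, the four helpers above expose the knobs of the block-diffusion (dllm) decoding loop on every PyTorch-engine CLI. As a rough illustration of what they control, here is a standalone sketch of how a confidence-threshold unmasking step could look; the function name and the exact acceptance rules are assumptions for illustration, not the code this PR adds to the engine.

```python
# Illustrative sketch only: what the dllm unmasking options roughly control.
# `pick_unmask_positions` is a hypothetical helper, not part of lmdeploy.
import torch


def pick_unmask_positions(confidences: torch.Tensor,
                          strategy: str = 'low_confidence_dynamic',
                          confidence_threshold: float = 0.85,
                          tokens_per_step: int = 1) -> torch.Tensor:
    """Choose which still-masked positions of one diffusion block to reveal.

    `confidences` holds the max softmax probability per position, with
    already-revealed positions set to -inf so they are never picked again.
    """
    if strategy == 'low_confidence_dynamic':
        # Reveal every position whose confidence clears the threshold,
        # falling back to the single most confident one so the loop advances.
        picked = (confidences >= confidence_threshold).nonzero(as_tuple=True)[0]
        return picked if picked.numel() > 0 else confidences.argmax().unsqueeze(0)
    if strategy == 'low_confidence_static':
        # Reveal a fixed number of the most confident positions per step
        # (roughly block_length / dllm_denoising_steps of them).
        return confidences.topk(tokens_per_step).indices
    if strategy == 'sequential':
        # Reveal positions left to right, ignoring confidence entirely.
        return torch.arange(tokens_per_step)
    raise ValueError(f'unknown unmasking strategy: {strategy}')


# Example: an 8-token block where three positions are already confident enough.
conf = torch.tensor([0.99, 0.40, 0.91, 0.20, 0.88, 0.10, 0.30, 0.05])
print(pick_unmask_positions(conf, 'low_confidence_dynamic', 0.85))  # tensor([0, 2, 4])
```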
12 changes: 12 additions & 0 deletions lmdeploy/messages.py
@@ -335,6 +335,12 @@ class PytorchEngineConfig:
disable_vision_encoder (bool): Whether to disable loading vision
encoder. Default to False.
logprobs_mode (str): The mode of logprob, options: ['raw_logits', 'raw_logprobs']
dllm_block_length (int): Block size of block diffusion model.
dllm_unmasking_strategy (str): Dllm unmasking strategy, options:
['low_confidence_dynamic', 'low_confidence_static', 'sequential'].
dllm_denoising_steps (int): Dllm denoising steps.
dllm_confidence_threshold (float): dllm unmasking threshold for
dynamic unmasking.
"""
dtype: str = 'auto'
tp: int = 1
@@ -370,6 +376,12 @@ class PytorchEngineConfig:
disable_vision_encoder: bool = False
logprobs_mode: str = None

# dllm
dllm_block_length: int = None
dllm_unmasking_strategy: str = 'low_confidence_dynamic'
dllm_denoising_steps: int = None
dllm_confidence_threshold: float = 0.85

role: EngineRole = EngineRole.Hybrid
migration_backend: MigrationBackend = MigrationBackend.DLSlime

5 changes: 4 additions & 1 deletion lmdeploy/metrics/stats.py
@@ -198,7 +198,10 @@ def update_from_output(self, outputs: EngineOutput, req_state: RequestState):
outputs (EngineOutput): The output from the engine containing information about the current iteration.
req_state (RequestState): The state of the request, including timestamps and token counts.
"""
self.new_generation_tokens = outputs.num_token - req_state.generation_tokens
new_generation_tokens = outputs.num_token - req_state.generation_tokens
if new_generation_tokens == 0:
return
self.new_generation_tokens = new_generation_tokens
if req_state.first_token_time == 0:
# It means the first token is generated in this iteration
req_state.first_token_time = outputs.req_metrics.token_timestamp
7 changes: 5 additions & 2 deletions lmdeploy/model.py
@@ -737,7 +737,7 @@ class HFChatTemplate(BaseChatTemplate):

def __init__(self, model_path: str = '', **kwargs):
try:
from transformers import AutoTokenizer, PretrainedConfig
from transformers import AutoConfig, AutoTokenizer, PretrainedConfig
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.system_start, self.system_end = self._role_instruction('system')
self.user_start, self.user_end = self._role_instruction('user')
@@ -747,7 +747,10 @@ def __init__(self, model_path: str = '', **kwargs):
self.stop_words.append(self.tokenizer.eos_token)
if hasattr(self.tokenizer, 'eot_token') and self.tokenizer.eot_token is not None:
self.stop_words.append(self.tokenizer.eot_token)
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
try:
cfg = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
except Exception as e: # noqa
cfg = PretrainedConfig.from_pretrained(model_path, trust_remote_code=True)
self.is_gpt_oss = getattr(cfg, 'architectures', [''])[0] == 'GptOssForCausalLM'
if self.is_gpt_oss:
self.stop_words.append('<|call|>')
1 change: 1 addition & 0 deletions lmdeploy/pytorch/backends/attention.py
@@ -93,6 +93,7 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> AttentionImpl[T]:
"""build."""
36 changes: 32 additions & 4 deletions lmdeploy/pytorch/backends/cuda/attention.py
@@ -1,5 +1,6 @@
# Copyright (c) OpenMMLab. All rights reserved.

import functools
from dataclasses import dataclass
from typing import Literal

@@ -20,8 +21,8 @@
assert torch.ops.flash_attn_3 is not None
use_fa3 = True
except Exception:
logger.warning('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')
logger.debug('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')


@dataclass
@@ -62,6 +63,7 @@ def __init__(
sliding_window: int = None,
logit_softcapping: float = None,
causal: bool = True,
block_sparse_size: int = 1,
**kwargs,
):
super().__init__(
@@ -91,6 +93,7 @@ def __init__(
world_size, rank = get_tp_world_rank()
self.alibi_head_offset = self.num_heads * rank
self.alibi_num_heads = self.num_heads * world_size
self.block_sparse_size = block_sparse_size

def forward(
self,
@@ -116,7 +119,7 @@ def forward(
kv_flatten_size = attn_metadata.kv_flatten_size
quant_policy = attn_metadata.quant_policy
if attn_metadata.is_decoding:
max_q_seqlen = 1
max_q_seqlen = self.block_sparse_size
else:
max_q_seqlen = query.numel() // (query.size(-1) * query.size(-2))
fill_max_q_seqlen = max_q_seqlen
@@ -213,11 +216,21 @@ def forward(
logit_softcapping=self.logit_softcapping,
sinks=learnable_sink,
causal=self.causal,
block_sparse_size=self.block_sparse_size,
)

return attn_output


@functools.lru_cache
def use_fa3_warning():
if use_fa3:
return True
logger.warning('For higher performance, please install FlashAttention-3 '
'https://github.com/Dao-AILab/flash-attention')
return False


class FlashMLAImpl(TritonAttentionImpl):

def __init__(
@@ -252,6 +265,7 @@ def __init__(
from lmdeploy.pytorch.kernels.cuda import flash_mla_fwd
self.flash_mla_fwd = flash_mla_fwd
assert num_kv_heads == 1, 'MLA requires num kv heads equal to 1'
use_fa3_warning()

def forward(
self,
@@ -512,6 +526,14 @@ def forward(
return attn_output


@functools.lru_cache
def _enable_fa3(alibi: bool, learnable_sink: bool, block_sparse_size: int):
enable = not alibi and not learnable_sink and block_sparse_size == 1
if enable and not use_fa3_warning():
enable = False
return enable


class TritonAttentionBuilder(AttentionBuilder[TritonAttentionMetadata]):
"""Triton attention builder."""

@@ -528,10 +550,13 @@ def build(
causal: bool = True,
use_flash_mla: bool = False,
learnable_sink: bool = False,
block_sparse_size: int = 1,
**kwargs,
) -> TritonAttentionImpl:
"""build."""
enable_fa3 = _enable_fa3(alibi, learnable_sink, block_sparse_size)
if use_flash_mla is True:
logger.debug('Build FlashMLAImpl Attention')
return FlashMLAImpl(num_heads,
head_size,
scale=scale,
@@ -542,7 +567,8 @@
logical_softcapping=logical_softcapping,
causal=causal,
**kwargs)
elif use_fa3 and not alibi and not learnable_sink:
elif enable_fa3:
logger.debug('Build FA3Impl Attention')
return FA3Impl(num_heads,
head_size,
scale=scale,
@@ -554,6 +580,7 @@
causal=causal,
**kwargs)
else:
logger.debug('Build TritonAttentionImpl Attention')
return TritonAttentionImpl(num_heads,
head_size,
scale=scale,
@@ -563,4 +590,5 @@
sliding_window=sliding_window,
logical_softcapping=logical_softcapping,
causal=causal,
block_sparse_size=block_sparse_size,
**kwargs)
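The common thread in these attention changes: with block diffusion, a decoding step carries `block_sparse_size` query tokens per sequence instead of one, and the FA3 fast path is only taken when that size is 1 (and neither ALiBi nor a learnable sink is in use). A small standalone mirror of that gating, for clarity; `fa3_available` stands in for the module-level `use_fa3` flag:

```python
# Standalone mirror of the decode-time gating introduced above; not the module itself.
def enable_fa3(fa3_available: bool, alibi: bool, learnable_sink: bool,
               block_sparse_size: int) -> bool:
    """FA3 is only used for plain causal decoding with one query token per step."""
    return fa3_available and not alibi and not learnable_sink and block_sparse_size == 1


def decode_max_q_seqlen(block_sparse_size: int = 1) -> int:
    """Query length per sequence during decoding; was hard-coded to 1 before this change."""
    return block_sparse_size


assert enable_fa3(True, alibi=False, learnable_sink=False, block_sparse_size=1) is True
assert enable_fa3(True, alibi=False, learnable_sink=False, block_sparse_size=4) is False
assert decode_max_q_seqlen(4) == 4
```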
27 changes: 20 additions & 7 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -9,9 +9,11 @@
from lmdeploy.pytorch.config import BackendConfig, CacheConfig, ModelConfig
from lmdeploy.pytorch.model_inputs import StepContext, get_step_ctx_manager
from lmdeploy.pytorch.models.utils.cudagraph import CudaGraphMeta
from lmdeploy.pytorch.strategies.base import StrategyFactoryBase
from lmdeploy.utils import get_logger

from ..graph_runner import GraphRunner
from .attention import TritonAttentionMetadata

logger = get_logger('lmdeploy')

@@ -146,6 +148,11 @@ def __init__(self, model: torch.nn.Module, model_config: ModelConfig, cache_conf
self._runner_map: Dict[Any, CUDASingleGraphRunner] = dict()
self.has_try_compile_model: bool = False

# strategy factory
build_ctx = model.ctx_mgr.build_ctx
strategy_factory: StrategyFactoryBase = build_ctx.strategy_factory
self.cudagraph_strategy = strategy_factory.build_cudagraph_strategy()

def check_enable_graph(self):
"""Check enable graph."""
if self.backend_config.eager_mode:
@@ -173,18 +180,24 @@ def _get_capture_tokens(self, batch_size: int):
assert False, f'Unsupported batch_size={batch_size}'

def get_graph_key(self, input_ids: torch.Tensor, position_ids: torch.Tensor, past_key_values: List,
attn_metadata: Any, inputs_embeds: torch.Tensor, **kwargs):
attn_metadata: TritonAttentionMetadata, inputs_embeds: torch.Tensor, **kwargs):
"""Get graph key."""
context = self.ctx_mgr.current_context()
is_decoding = context.is_decoding
num_tokens = input_ids.numel()
batch_size = attn_metadata.q_seqlens.size(0)
meta = self.get_meta()
enable_microbatch = get_step_ctx_manager().current_context().enable_microbatch
if meta.padding_batch_size is None:
new_num_tokens = self._get_capture_tokens(num_tokens)
batch_size = self._get_capture_tokens(batch_size)
else:
new_num_tokens = self._get_capture_tokens(meta.padding_batch_size)
return (new_num_tokens, is_decoding, enable_microbatch)
batch_size = self._get_capture_tokens(meta.padding_batch_size)
return (batch_size, is_decoding, enable_microbatch)

def _get_max_tokens(self, graph_key: tuple):
max_batches = graph_key[0]
is_decoding = graph_key[1]
assert is_decoding
return self.cudagraph_strategy.get_max_tokens(max_batches)

def __call__(self, **kwargs):
"""call."""
@@ -198,10 +211,10 @@ def __call__(self, **kwargs):
return self.model(**kwargs)

graph_key = self.get_graph_key(**kwargs)
max_tokens = graph_key[0]
max_batches = graph_key[0]
is_decoding = graph_key[1]
if graph_key not in self._runner_map:
max_batches = max_tokens if is_decoding else self.max_batches
max_tokens = self._get_max_tokens(graph_key)
runner = CUDASingleGraphRunner(self.model,
max_batches=max_batches,
max_tokens=max_tokens,
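The graph key is now derived from the (padded) batch size rather than from the raw token count, and the token budget of a captured decode graph is delegated to a cudagraph strategy object, presumably so a dllm decode graph can reserve `batch_size * block_length` tokens where autoregressive decoding reserves `batch_size`. A hedged sketch of what such a strategy pair could look like; the class names and the multiplication are assumptions, not the code under `lmdeploy/pytorch/strategies`:

```python
# Illustrative only: a possible shape for the cudagraph strategy split.
from abc import ABC, abstractmethod


class CudaGraphStrategy(ABC):

    @abstractmethod
    def get_max_tokens(self, max_batches: int) -> int:
        """Token capacity to capture for a decode graph covering `max_batches` sequences."""


class ARCudaGraphStrategy(CudaGraphStrategy):
    """Autoregressive decoding: one query token per sequence per step."""

    def get_max_tokens(self, max_batches: int) -> int:
        return max_batches


class DLLMCudaGraphStrategy(CudaGraphStrategy):
    """Block-diffusion decoding: each sequence feeds a whole block per step."""

    def __init__(self, block_length: int):
        self.block_length = block_length

    def get_max_tokens(self, max_batches: int) -> int:
        return max_batches * self.block_length


print(ARCudaGraphStrategy().get_max_tokens(64))     # 64
print(DLLMCudaGraphStrategy(4).get_max_tokens(64))  # 256
```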
2 changes: 1 addition & 1 deletion lmdeploy/pytorch/check_env/transformers.py
@@ -4,7 +4,7 @@
from .base import BaseChecker

MIN_TRANSFORMERS_VERSION = '4.33.0'
MAX_TRANSFORMERS_VERSION = '4.53.3'
MAX_TRANSFORMERS_VERSION = '4.56.1'


class TransformersChecker(BaseChecker):
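The change here only widens the accepted transformers range. As a minimal illustration of the kind of bound check these constants feed (the actual TransformersChecker logic may differ):

```python
# Minimal sketch of a version-range check using the constants above; illustrative only.
from packaging import version

import transformers

MIN_TRANSFORMERS_VERSION = '4.33.0'
MAX_TRANSFORMERS_VERSION = '4.56.1'

installed = version.parse(transformers.__version__)
low, high = version.parse(MIN_TRANSFORMERS_VERSION), version.parse(MAX_TRANSFORMERS_VERSION)
if not (low <= installed <= high):
    print(f'transformers {installed} is outside the tested range '
          f'[{MIN_TRANSFORMERS_VERSION}, {MAX_TRANSFORMERS_VERSION}]; proceed with caution.')
```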