Skip to content

Commit a96391b

Browse files
authored
Support SDAR (#3922)
* refactor SchedulerSequence * block sparse attn * support SDAR * fix max_new_tokens;update profiler * add args * fix multiround stop words * fix sampling step * optimize position_ids * fix long context * fix vlm * fix stopping * move args into logitsprocessor * rename * fix pd * rename * strategy + abstruct factory * update seqs * add moe support * bind block length * fix num loops * enum unmasking type * typo fixing * warning * fix metric * limit batch size * rename field;comment unmasking strategy * suppression warning * colored vis * fix dummy
1 parent 85f4150 commit a96391b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+3444
-665
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ LMDeploy is a toolkit for compressing, deploying, and serving LLM, developed by
150150
<li>Phi-3.5-MoE (16x3.8B)</li>
151151
<li>Phi-4-mini (3.8B)</li>
152152
<li>MiniCPM3 (4B)</li>
153+
<li>SDAR (1.7B-30B)</li>
153154
<li>gpt-oss (20B, 120B)</li>
154155
</ul>
155156
</td>

README_ja.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ LMDeploy TurboMindエンジンは卓越した推論能力を持ち、さまざ
137137
<li>Phi-3.5-MoE (16x3.8B)</li>
138138
<li>Phi-4-mini (3.8B)</li>
139139
<li>MiniCPM3 (4B)</li>
140+
<li>SDAR (1.7B-30B)</li>
140141
</ul>
141142
</td>
142143
<td>

README_zh-CN.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力,在各种规模的模型
151151
<li>Phi-3.5-MoE (16x3.8B)</li>
152152
<li>Phi-4-mini (3.8B)</li>
153153
<li>MiniCPM3 (4B)</li>
154+
<li>SDAR (1.7B-30B)</li>
154155
<li>gpt-oss (20B, 120B)</li>
155156
</ul>
156157
</td>

benchmark/profile_throughput.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,10 @@ def parse_args():
307307
# pytorch engine args
308308
pt_group = parser.add_argument_group('PyTorch engine arguments')
309309
ArgumentHelper.eager_mode(pt_group)
310+
ArgumentHelper.dllm_block_length(pt_group)
311+
ArgumentHelper.dllm_unmasking_strategy(pt_group)
312+
ArgumentHelper.dllm_denoising_steps(pt_group)
313+
ArgumentHelper.dllm_confidence_threshold(pt_group)
310314

311315
tp_act = ArgumentHelper.tp(pt_group)
312316
cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -363,6 +367,10 @@ def main():
363367
quant_policy=args.quant_policy,
364368
dtype=args.dtype,
365369
distributed_executor_backend=args.distributed_executor_backend,
370+
dllm_block_length=args.dllm_block_length,
371+
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
372+
dllm_denoising_steps=args.dllm_denoising_steps,
373+
dllm_confidence_threshold=args.dllm_confidence_threshold,
366374
)
367375

368376
if args.use_uvloop:

docs/en/supported_models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ The following tables detail the models supported by LMDeploy's TurboMind engine
120120
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
121121
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
122122
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
123+
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
123124

124125
```{note}
125126
* [1] Currently Mono-InternVL does not support FP16 due to numerical instability. Please use BF16 instead.

docs/zh_cn/supported_models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
| Phi-3.5-mini | 3.8B | LLM | Yes | Yes | No | - | - |
121121
| Phi-3.5-MoE | 16x3.8B | LLM | Yes | Yes | No | - | - |
122122
| Phi-3.5-vision | 4.2B | MLLM | Yes | Yes | No | - | - |
123+
| SDAR | 1.7B-30B | LLM | Yes | Yes | No | - | - |
123124

124125
```{note}
125126
* [1] 目前,Mono-InternVL不支持FP16,因为数值不稳定。请改用BF16

lmdeploy/cli/cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ def add_parser_chat():
5555
ArgumentHelper.adapters(pt_group)
5656
ArgumentHelper.device(pt_group)
5757
ArgumentHelper.eager_mode(pt_group)
58+
ArgumentHelper.dllm_block_length(pt_group)
5859
# common engine args
5960
dtype_act = ArgumentHelper.dtype(pt_group)
6061
tp_act = ArgumentHelper.tp(pt_group)

lmdeploy/cli/serve.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ def add_parser_api_server():
9292
ArgumentHelper.eager_mode(pt_group)
9393
ArgumentHelper.disable_vision_encoder(pt_group)
9494
ArgumentHelper.logprobs_mode(pt_group)
95+
ArgumentHelper.dllm_block_length(pt_group)
96+
ArgumentHelper.dllm_unmasking_strategy(pt_group)
97+
ArgumentHelper.dllm_denoising_steps(pt_group)
98+
ArgumentHelper.dllm_confidence_threshold(pt_group)
9599

96100
# common engine args
97101
dtype_act = ArgumentHelper.dtype(pt_group)
@@ -219,6 +223,10 @@ def api_server(args):
219223
hf_overrides=args.hf_overrides,
220224
disable_vision_encoder=args.disable_vision_encoder,
221225
logprobs_mode=args.logprobs_mode,
226+
dllm_block_length=args.dllm_block_length,
227+
dllm_unmasking_strategy=args.dllm_unmasking_strategy,
228+
dllm_denoising_steps=args.dllm_denoising_steps,
229+
dllm_confidence_threshold=args.dllm_confidence_threshold,
222230
)
223231
else:
224232
from lmdeploy.messages import TurbomindEngineConfig

lmdeploy/cli/utils.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,36 @@ def logprobs_mode(parser):
624624
choices=[None, 'raw_logits', 'raw_logprobs'],
625625
help='The mode of logprobs.')
626626

627+
@staticmethod
628+
def dllm_block_length(parser):
629+
"""dllm_block_length for dllm."""
630+
return parser.add_argument('--dllm-block-length', type=int, default=None, help='Block length for dllm')
631+
632+
@staticmethod
633+
def dllm_unmasking_strategy(parser):
634+
"""Dllm unmasking strategy."""
635+
return parser.add_argument('--dllm-unmasking-strategy',
636+
type=str,
637+
default='low_confidence_dynamic',
638+
choices=['low_confidence_dynamic', 'low_confidence_static', 'sequential'],
639+
help='The unmasking strategy for dllm.')
640+
641+
@staticmethod
642+
def dllm_denoising_steps(parser):
643+
"""Dllm denoising steps."""
644+
return parser.add_argument('--dllm-denoising-steps',
645+
type=int,
646+
default=None,
647+
help='The number of denoising steps for dllm.')
648+
649+
@staticmethod
650+
def dllm_confidence_threshold(parser):
651+
"""Dllm confidence threshold."""
652+
return parser.add_argument('--dllm-confidence-threshold',
653+
type=float,
654+
default=0.85,
655+
help='The confidence threshold for dllm.')
656+
627657

628658
# adapted from https://github.com/vllm-project/vllm/blob/main/vllm/utils/__init__.py
629659
class FlexibleArgumentParser(argparse.ArgumentParser):

lmdeploy/messages.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,12 @@ class PytorchEngineConfig:
336336
disable_vision_encoder (bool): Whether to disable loading vision
337337
encoder. Default to False.
338338
logprobs_mode (str): The mode of logprob, options: ['raw_logits', 'raw_logprobs']
339+
dllm_block_length (int): Block size of block diffusion model.
340+
dllm_unmasking_strategy (str): Dllm unmasking strategy, options:
341+
['low_confidence_dynamic', 'low_confidence_static', 'sequential'].
342+
dllm_denoising_steps (int): Dllm denoising steps.
343+
dllm_confidence_threshold (float): dllm unmasking threshold for
344+
dynamic unmasking.
339345
"""
340346
dtype: str = 'auto'
341347
tp: int = 1
@@ -371,6 +377,12 @@ class PytorchEngineConfig:
371377
disable_vision_encoder: bool = False
372378
logprobs_mode: str = None
373379

380+
# dllm
381+
dllm_block_length: int = None
382+
dllm_unmasking_strategy: str = 'low_confidence_dynamic'
383+
dllm_denoising_steps: int = None
384+
dllm_confidence_threshold: float = 0.85
385+
374386
role: EngineRole = EngineRole.Hybrid
375387
migration_backend: MigrationBackend = MigrationBackend.DLSlime
376388

0 commit comments

Comments (0)