24 changes: 24 additions & 0 deletions python/sglang/srt/server_args.py
@@ -775,6 +775,30 @@ def _handle_model_specific_adjustments(self):
    # use bf16 for mxfp4 triton kernels
    self.dtype = "bfloat16"

elif model_arch == "DeepseekV3ForCausalLM":
    # Enable optimizations for DeepSeek V3 on Blackwell
    if is_sm100_supported():
        # Set attention backend to trtllm_mla if not already set
        if self.attention_backend is None:
            self.attention_backend = "trtllm_mla"
            logger.info(
                f"Set attention backend to trtllm_mla on sm100 for {model_arch}"
            )

        # Enable FlashInfer TRTLLM MoE
        if not self.enable_flashinfer_trtllm_moe:
            self.enable_flashinfer_trtllm_moe = True
            logger.info(
                f"Enable FlashInfer TRTLLM MoE on sm100 for {model_arch}"
            )
Comment on lines +789 to +793 (Contributor):

critical

The code attempts to use self.enable_flashinfer_trtllm_moe, which is not an attribute of the ServerArgs class. This will cause an AttributeError at runtime.
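To make the failure concrete, here is a minimal, self-contained sketch; the two fields below are placeholders standing in for the real ServerArgs definition, which (per this review) does not declare enable_flashinfer_trtllm_moe:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ServerArgs:
    # Placeholder subset of fields; not the real class definition.
    attention_backend: Optional[str] = None
    moe_runner_backend: str = "auto"

args = ServerArgs()
try:
    # Mirrors the added branch: the attribute is read before it is ever assigned.
    if not args.enable_flashinfer_trtllm_moe:
        args.enable_flashinfer_trtllm_moe = True
except AttributeError as exc:
    print(f"Fails as described: {exc}")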

Additionally, the --enable-flashinfer-trtllm-moe command-line argument is deprecated. The recommended way to enable this feature is by setting moe_runner_backend to 'flashinfer_trtllm'. The help message for the deprecated argument states: "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set --moe-runner-backend to 'flashinfer_trtllm' instead."

The suggested change fixes the error and uses the current recommended approach. It also checks if a user has already specified a moe_runner_backend to avoid overriding their choice, which is consistent with how attention_backend is handled.

if self.moe_runner_backend == "auto":
    self.moe_runner_backend = "flashinfer_trtllm"
    logger.info(
        f"Set moe_runner_backend to 'flashinfer_trtllm' on sm100 for {model_arch}"
    )
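As a quick, self-contained check of the non-overriding behavior (the _Args dataclass and the _apply_sm100_moe_default helper below are illustrative stand-ins, not code from this PR):

from dataclasses import dataclass

@dataclass
class _Args:
    # Stand-in for ServerArgs with only the field under discussion.
    moe_runner_backend: str = "auto"

def _apply_sm100_moe_default(args: _Args) -> None:
    # Hypothetical helper wrapping the suggested guard.
    if args.moe_runner_backend == "auto":
        args.moe_runner_backend = "flashinfer_trtllm"

explicit = _Args(moe_runner_backend="flashinfer_cutlass")  # explicit user choice (placeholder value)
_apply_sm100_moe_default(explicit)
assert explicit.moe_runner_backend == "flashinfer_cutlass"  # user choice preserved

default = _Args()
_apply_sm100_moe_default(default)
assert default.moe_runner_backend == "flashinfer_trtllm"  # sm100 default applied

On the command line, the same value can be supplied directly with --moe-runner-backend flashinfer_trtllm, as the deprecation notice suggests.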


        # Enable FlashInfer AllReduce Fusion
        if not self.enable_dp_attention:
            self.enable_flashinfer_allreduce_fusion = True
            logger.info(
                f"Enable FlashInfer AllReduce Fusion on sm100 for {model_arch}"
            )

elif "Llama4" in model_arch and self.device != "cpu":
assert self.attention_backend in {
"fa3",