Fix the Eagle3 inference failure issue. 1. Hard-coding the version string "8.3" in the check is hard to maintain; the version condition should be defined as a constant. 2. Magic number: the value 0 used for max_seq_len should be defined as a constant.

sunchendd · sunchendd · commit e7957f413a82 · 2025-12-04T10:54:53.000+08:00
Signed-off-by: sunchendd <sunchendong@xfusion.com>
diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py
@@ -15,6 +15,10 @@
 import torch
 
 
+MIN_CANN_VERSION_FOR_OPTIMIZED_MASK = "8.3"
+DEFAULT_MAX_SEQ_LEN = 0
+
+
 def _generate_attn_mask(max_seq_len, dtype):
     # Construct lower triangle matrix.
     mask_flag = torch.ones((max_seq_len, max_seq_len),
@@ -88,8 +92,9 @@ def get_splitfuse_attn_mask(
     ) -> torch.Tensor:
         cann_version = getattr(torch.version, "cann", "")
         target_device = device or self.device
-        use_chunked_mask = (seq_lens is None or position is None
-                            or dtype is None or cann_version.startswith("8.3"))
+        use_chunked_mask = (
+            seq_lens is None or position is None or dtype is None
+            or cann_version.startswith(MIN_CANN_VERSION_FOR_OPTIMIZED_MASK))
 
         if use_chunked_mask:
             if target_device is None:
@@ -106,7 +111,8 @@ def get_splitfuse_attn_mask(
         if target_device is None:
             raise ValueError(
                 "splitfuse_attn_mask requires device for non-chunked mask")
-        max_seq_len = seq_lens.max().item() if seq_lens.numel() > 0 else 0
+        max_seq_len = (seq_lens.max().item()
+                       if seq_lens.numel() > 0 else DEFAULT_MAX_SEQ_LEN)
         self._update_attn_cache(max_seq_len, dtype)
         # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
         # is not the same. Fix this in the future when kernel is ready.