22
33import triton
44import triton .language as tl
5+ import triton .runtime .driver as driver
56
# g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
@triton.jit
def fused_gdn_gating_kernel(
    # NOTE(review): the parameters above `beta` were elided by the diff hunk;
    # they are reconstructed here from the launch site in fused_gdn_gating()
    # below — confirm names and tl.constexpr qualifiers against the full file.
    g,
    A_log,
    a,
    dt_bias,
    seq_len: tl.constexpr,
    NUM_HEADS: tl.constexpr,
    NUM_BATCHES,
    beta: tl.constexpr,
    threshold: tl.constexpr,
    BLK_HEADS: tl.constexpr,
    COL_ITER: tl.constexpr,
    BLK_BATCHES: tl.constexpr,
    ROW_ITER: tl.constexpr,
):
    """Tiled computation of g = -exp(A_log) * softplus(a + dt_bias).

    Grid is (num_programs, seq_len). Each program walks ROW_ITER tiles of
    BLK_BATCHES batch rows and, per row tile, COL_ITER tiles of BLK_HEADS
    head columns, masking ragged edges against NUM_BATCHES / NUM_HEADS.
    """
    prog_b, prog_s = tl.program_id(0), tl.program_id(1)
    for row_iter in range(0, ROW_ITER):
        rows = (prog_b * ROW_ITER * BLK_BATCHES
                + row_iter * BLK_BATCHES
                + tl.arange(0, BLK_BATCHES))

        for col_iter in range(0, COL_ITER):
            cols = col_iter * BLK_HEADS + tl.arange(0, BLK_HEADS)

            # Flattened (batch, seq, head) offsets for this 2-D tile.
            offs = (rows[:, None] * seq_len * NUM_HEADS
                    + prog_s * NUM_HEADS
                    + cols[None, :])
            col_ok = cols < NUM_HEADS
            tile_ok = col_ok[None, :] & (rows[:, None] < NUM_BATCHES)

            tile_A_log = tl.load(A_log + cols, mask=col_ok)
            tile_a = tl.load(a + offs, mask=tile_ok)
            tile_bias = tl.load(dt_bias + cols, mask=col_ok)

            # Accumulate in fp32: with fp16 inputs, exp()/softplus below can
            # overflow to inf without the up-cast.
            x = tile_a.to(tl.float32) + tile_bias.to(tl.float32)[None, :]
            # Numerically stable softplus: linearize once beta*x > threshold.
            softplus_x = tl.where(
                beta * x <= threshold,
                (1 / beta) * tl.log(1 + tl.exp(beta * x)),
                x)

            tile_g = -tl.exp(tile_A_log.to(tl.float32)) * softplus_x

            tl.store(g + offs, tile_g.to(g.dtype.element_ty), mask=tile_ok)
3746
def fused_gdn_gating(
    A_log: torch.Tensor,
    # NOTE(review): the parameters between A_log and the return annotation were
    # elided by the diff hunk; they are reconstructed from the kernel launch
    # below — confirm names and default values against the full file.
    a: torch.Tensor,
    dt_bias: torch.Tensor,
    beta: float = 1.0,
    threshold: float = 20.0,
) -> torch.Tensor:
    """Launch the fused gating kernel: g = -exp(A_log) * softplus(a + dt_bias).

    Args:
        A_log: per-head log decay, shape (num_heads,).
        a: gate input, shape (batch, num_heads); this path assumes seq_len == 1.
        dt_bias: per-head bias added to ``a``, shape (num_heads,).
        beta: softplus sharpness.
        threshold: softplus linearization threshold (numerical stability).

    Returns:
        fp32 tensor with the same shape as ``a``.
    """
    batch, num_heads = a.shape
    seq_len = 1  # decode-only path: one token per batch row

    # Vector-core count of the current NPU; at most one program per core.
    num_cores = driver.active.utils.get_device_properties(
        torch.npu.current_device())["num_vectorcore"]

    # Column (head) tiling: fixed tile of 8 heads per inner iteration.
    BLK_HEADS = 8  # TODO: tune per hardware
    COL_ITER = triton.cdiv(num_heads, BLK_HEADS)

    # Row (batch) tiling: if every row gets its own core, no row loop is
    # needed; otherwise size the row tile from the unified-buffer budget and
    # let each program iterate ROW_ITER times over its assigned rows.
    if batch <= num_cores:
        progs = batch
        BLK_BATCHES = 1
        ROW_ITER = 1
    else:
        progs = num_cores

        # Empirically tuned ("black box") unified-buffer budget, in bytes,
        # scaled by `factor` per BLK_HEADS column tile; the trailing //2
        # halves the power-of-two tile for headroom.
        ub_budget_bytes = 1572864
        factor = 64
        rows_per_core = triton.cdiv(batch, num_cores)
        BLK_BATCHES = triton.next_power_of_2(
            triton.cdiv(ub_budget_bytes, factor * BLK_HEADS)
            // a.element_size()) // 2
        ROW_ITER = triton.cdiv(rows_per_core, BLK_BATCHES)

    g = torch.empty_like(a, dtype=torch.float32)

    grid = (progs, seq_len)
    # NOTE(review): the positional args between `a` and `batch` were elided by
    # the diff hunk; dt_bias/seq_len/num_heads reconstructed to match the
    # kernel signature — confirm against the full file.
    fused_gdn_gating_kernel[grid](
        g,
        A_log,
        a,
        dt_bias,
        seq_len,
        num_heads,
        batch,
        beta,
        threshold,
        BLK_HEADS=BLK_HEADS,
        COL_ITER=COL_ITER,
        BLK_BATCHES=BLK_BATCHES,
        ROW_ITER=ROW_ITER,
    )
    return g
0 commit comments