
Commit e904207
fix kernel numerical error
1 parent e1d4a47 commit e904207

File tree

4 files changed: 39 additions & 22 deletions

lmdeploy/pytorch/kernels/cuda/apply_rotary_pos_emb.py

Lines changed: 23 additions & 4 deletions

@@ -5,6 +5,25 @@
 from torch import Tensor
 
 
+@triton.jit
+def _apply_rotary_impl(x_l, x_h, cos_l, cos_h, sin_l, sin_h):
+    """Apply rotary positional embedding implementation."""
+    # x_l, x_h: [BLOCK, BLOCK_N]
+    # cos_l, cos_h, sin_l, sin_h: [BLOCK, BLOCK_N]
+
+    # qe_l = q_l * cos_l - q_h * sin_l
+    # qe_h = q_h * cos_h + q_l * sin_h
+
+    # triton 3.4 would do fma 3 times to perform the above computation,
+    # which causes higher numerical error. So we manually expand the
+    # computation to avoid fma.
+    x_l_new = x_l * cos_l + 0
+    x_l_new -= x_h * sin_l + 0
+    x_h_new = x_h * cos_h + 0
+    x_h_new += x_l * sin_h + 0
+    return x_l_new, x_h_new
+
+
 @triton.jit(do_not_specialize=('seq_len', ))
 def apply_rotary_pos_emb_qk_kernel(
     Q,
@@ -67,8 +86,8 @@ def apply_rotary_pos_emb_qk_kernel(
 
     q_l = tl.load(ql_ptrs)
     q_h = tl.load(qh_ptrs)
-    qe_l = q_l * cos_l - q_h * sin_l
-    qe_h = q_h * cos_h + q_l * sin_h
+
+    qe_l, qe_h = _apply_rotary_impl(q_l, q_h, cos_l, cos_h, sin_l, sin_h)
 
     tl.store(qel_ptrs, qe_l, mask=seq_mask)
     tl.store(qeh_ptrs, qe_h, mask=seq_mask)
@@ -86,8 +105,8 @@ def apply_rotary_pos_emb_qk_kernel(
     keh_ptrs += head_id * stride_keh
     k_l = tl.load(kl_ptrs)
     k_h = tl.load(kh_ptrs)
-    ke_l = k_l * cos_l - k_h * sin_l
-    ke_h = k_h * cos_h + k_l * sin_h
+
+    ke_l, ke_h = _apply_rotary_impl(k_l, k_h, cos_l, cos_h, sin_l, sin_h)
 
     tl.store(kel_ptrs, ke_l, mask=seq_mask)
     tl.store(keh_ptrs, ke_h, mask=seq_mask)
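For context, the math the kernel computes is the standard half-split rotary embedding. Below is a minimal eager PyTorch sketch (the rotary_ref helper and the low/high-half layout are illustrative assumptions, not part of this patch); unlike the kernel, this version leaves the compiler free to contract the multiply-add chains into fma, which is exactly what the "+ 0" expansion above prevents.

import torch

def rotary_ref(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # Split the feature dim into low/high halves, mirroring the kernel's
    # x_l / x_h operands.
    half = x.shape[-1] // 2
    x_l, x_h = x[..., :half], x[..., half:]
    cos_l, cos_h = cos[..., :half], cos[..., half:]
    sin_l, sin_h = sin[..., :half], sin[..., half:]
    # Same formula as _apply_rotary_impl, without the anti-fma expansion.
    out_l = x_l * cos_l - x_h * sin_l
    out_h = x_h * cos_h + x_l * sin_h
    return torch.cat([out_l, out_h], dim=-1)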

lmdeploy/pytorch/kernels/cuda/rms_norm.py

Lines changed: 3 additions & 3 deletions

@@ -14,7 +14,7 @@ def _compute_rms_norm(x, w, eps: tl.constexpr, N_COLS: tl.constexpr):
 
     var = tl.sum(xf * xf, 0) * float(1.0 / N_COLS)
     out = xf * tl.math.rsqrt(var + eps)
-    out = (w * out).to(x.dtype)
+    out = w * out.to(x.dtype)
     return out
 
 
@@ -27,7 +27,7 @@ def rms_norm_kernel(input, weight, output, seq_len, input_row_stride: tl.constex
     offsets = tl.arange(0, BLOCK_N)
     mask = offsets < N_COLS
 
-    w = tl.load(weight + offsets, mask=mask).to(tl.float32)
+    w = tl.load(weight + offsets, mask=mask)
 
     x_ptr = input + prog_id * input_row_stride + offsets
     out_ptr = output + prog_id * input_row_stride + offsets
@@ -50,7 +50,7 @@ def add_rms_norm_kernel(input, weight, residual, output, out_residual, seq_len,
     offsets = tl.arange(0, BLOCK_N)
     mask = offsets < N_COLS
 
-    w = tl.load(weight + offsets, mask=mask).to(tl.float32)
+    w = tl.load(weight + offsets, mask=mask)
 
     x_ptr = input + prog_id * input_row_stride + offsets
     res_ptr = residual + prog_id * residual_row_stride + offsets
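The effect of this change, sketched in eager PyTorch (rms_norm_ref is a hypothetical helper for illustration, assuming the float32 accumulation visible in _compute_rms_norm): the weight now stays in its storage dtype and is applied after the normalized activation is cast back, instead of both operands being promoted to float32.

import torch

def rms_norm_ref(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    xf = x.float()  # accumulate the variance in float32, as the kernel does
    var = (xf * xf).mean(dim=-1, keepdim=True)
    out = xf * torch.rsqrt(var + eps)
    # After this commit: cast back to the input dtype first, then multiply by
    # the weight in its native dtype (before: out = (w.float() * out).to(x.dtype)).
    return w * out.to(x.dtype)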

tests/pytorch/kernel/test_apply_rotary.py

Lines changed: 5 additions & 11 deletions

@@ -35,7 +35,7 @@ def num_heads_k(self, request):
 
     @pytest.fixture
     def feature_dim(self):
-        yield 16
+        yield 128
 
     @pytest.fixture
     def seq_length(self, batch_size):
@@ -47,23 +47,23 @@ def max_seqlen(self, seq_length):
 
     @pytest.fixture
     def q_states(self, seq_length, num_heads_q, feature_dim, dtype):
-        yield torch.rand(seq_length.sum(), num_heads_q, feature_dim, dtype=dtype, device='cuda')
+        yield torch.randn(seq_length.sum(), num_heads_q, feature_dim, dtype=dtype, device='cuda')
 
     @pytest.fixture
     def k_states(self, seq_length, num_heads_k, feature_dim, dtype):
-        yield torch.rand(seq_length.sum(), num_heads_k, feature_dim, dtype=dtype, device='cuda')
+        yield torch.randn(seq_length.sum(), num_heads_k, feature_dim, dtype=dtype, device='cuda')
 
     @pytest.fixture
     def position_ids_1d(self, seq_length, max_seqlen):
         yield torch.randint(0, max_seqlen.item(), (seq_length.sum().item(), ), device='cuda')
 
     @pytest.fixture
     def cached_cos(self, max_seqlen, feature_dim, dtype):
-        yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
+        yield torch.randn(max_seqlen, feature_dim, dtype=dtype, device='cuda')
 
     @pytest.fixture
     def cached_sin(self, max_seqlen, feature_dim, dtype):
-        yield torch.rand(max_seqlen, feature_dim, dtype=dtype, device='cuda')
+        yield torch.randn(max_seqlen, feature_dim, dtype=dtype, device='cuda')
 
     @pytest.fixture
     def cos(self, cached_cos, position_ids_1d):
@@ -91,11 +91,5 @@ def test_apply_rotary(self, q_states, k_states, cos, sin, gt):
 
         rtol = None
         atol = None
-        if q_states.dtype == torch.float16:
-            rtol = 1e-5
-            atol = 1e-3
-        elif q_states.dtype == torch.bfloat16:
-            rtol = 1e-5
-            atol = 1e-2
         torch.testing.assert_close(q_embed, q_gt, rtol=rtol, atol=atol)
         torch.testing.assert_close(k_embed, k_gt, rtol=rtol, atol=atol)
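With rtol and atol left as None, torch.testing.assert_close falls back to its dtype-based defaults (in current PyTorch releases, rtol=1e-3/atol=1e-5 for float16 and rtol=1.6e-2/atol=1e-5 for bfloat16), so the test now holds the kernel to the stock tolerances rather than the loosened atol values removed above.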

tests/pytorch/kernel/test_rms_norm.py

Lines changed: 8 additions & 4 deletions

@@ -15,12 +15,16 @@ def dtype(self, request):
         yield request.param
 
     @pytest.fixture(scope='class')
-    def input(self, dtype):
-        yield torch.rand(4, 8, dtype=dtype, device='cuda')
+    def hidden_size(self):
+        yield 4096
 
     @pytest.fixture(scope='class')
-    def weight(self, dtype):
-        yield torch.rand(8, dtype=dtype, device='cuda')
+    def input(self, dtype, hidden_size):
+        yield torch.randn(4, hidden_size, dtype=dtype, device='cuda')
+
+    @pytest.fixture(scope='class')
+    def weight(self, dtype, hidden_size):
+        yield torch.randn(hidden_size, dtype=dtype, device='cuda')
 
     @pytest.fixture(scope='class')
     def eps(self):
