Add testcase for log_softmax

littledgg · littledgg · commit 02c328fe4f89 · 2025-11-04T19:07:09.000+08:00
diff --git a/custom_ops/batch_invariant_ops/batch_invariant_ops.py b/custom_ops/batch_invariant_ops/batch_invariant_ops.py
@@ -285,19 +285,19 @@ def _log_softmax_kernel(
         tl.store(output_row_start_ptr + col_idx, output, mask=mask)
 
 
-def log_softmax(input: paddle.Tensor, dim: int = -1) -> paddle.Tensor:
+def log_softmax(input: paddle.Tensor, axis: int = -1) -> paddle.Tensor:
     """
     Compute log_softmax using Triton kernel.
 
     Args:
         input: Input tensor
-        dim: Dimension along which to compute log_softmax (only -1 or last dim supported)
+        axis: Axis along which to compute log_softmax (only -1, i.e. the last axis, is supported)
     >> Stashed changes
     Returns:
         Tensor with log_softmax applied along the specified dimension
     """
-    # TODO:use axis not dim in paddle
-    if dim != -1 and dim != input.ndim - 1:
+    # print("You are using triton impl for log_softmax")
+    if axis != -1 and axis != input.ndim - 1:
         raise ValueError("This implementation only supports log_softmax along the last dimension")
 
     # Flatten all dimensions except the last one
@@ -477,10 +477,8 @@ def addmm_batch_invariant(bias, a, b, alpha=1.0, beta=1.0):
     return result
 
 
-def _log_softmax_batch_invariant(input, dim, _half_to_float):
-    # TODO:use axis not dim in Paddle
-    assert not _half_to_float, "not implemented"
-    return log_softmax(input, dim=dim)
+def _log_softmax_batch_invariant(input, axis):
+    return log_softmax(input, axis=axis)
 
 
 def mean_batch_invariant(input, dim, keepdim=False, dtype: paddle.dtype | None = None):
@@ -511,12 +509,12 @@ def enable_batch_invariant_mode():
 
     _original_ops["mm"] = paddle._C_ops.matmul
     _original_ops["addmm"] = paddle._C_ops.addmm
-    _original_ops["log_softmax"] = paddle.nn.functional.log_softmax
+    _original_ops["log_softmax"] = paddle._C_ops.log_softmax
     _original_ops["mean"] = paddle.mean
 
     paddle._C_ops.matmul = mm_batch_invariant
     paddle._C_ops.addmm = addmm_batch_invariant
-    paddle.nn.functional.log_softmax = _log_softmax_batch_invariant
+    paddle._C_ops.log_softmax = _log_softmax_batch_invariant
     paddle.mean = mean_batch_invariant
 
     _batch_invariant_MODE = True
@@ -532,7 +530,7 @@ def disable_batch_invariant_mode():
     if _original_ops["addmm"]:
         paddle._C_ops.addmm = _original_ops["addmm"]
     if _original_ops["log_softmax"]:
-        paddle.nn.functional.log_softmax = _original_ops["log_softmax"]
+        paddle._C_ops.log_softmax = _original_ops["log_softmax"]
     if _original_ops["mean"]:
         paddle.mean = _original_ops["mean"]
 
@@ -543,7 +541,6 @@ def disable_batch_invariant_mode():
 def set_batch_invariant_mode(enabled: bool = True):
     global _batch_invariant_MODE, _original_ops
     old_mode = _batch_invariant_MODE
-    # old_ops = _original_ops.copy()
     if enabled:
         enable_batch_invariant_mode()
     else:
diff --git a/tests/batch_invariant/test_batch_invariance_op_logsoftmax.py b/tests/batch_invariant/test_batch_invariance_op_logsoftmax.py
@@ -0,0 +1,109 @@
+# Adapted from https://github.com/thinking-machines-lab/batch_invariant_ops/blob/main/batch_invariant_ops/test_batch_invariance.py
+
+import random
+import unittest
+
+import paddle
+
+from custom_ops.batch_invariant_ops import set_batch_invariant_mode
+
+
+class TestBatchInvariantForLogsoftmax(unittest.TestCase):
+    def setUp(self):
+        """
+        Initialize the test environment
+        """
+        device = "gpu" if paddle.is_compiled_with_cuda() else "cpu"
+        paddle.set_device(device)
+
+    def create_softmax_trap_tensor(self, B, D, dtype):
+        """
+        Constructs a "trap" tensor designed to trigger batch-invariance issues in Softmax/LogSoftmax.
+        Inspired by https://thinkingmachines.ai/blog/defeating-nondeterminism-in-llm-inference/
+
+        Principle:
+        The goal is to make the result of `exp(a - max(a))` contain numbers spanning an extremely wide numerical range
+        (e.g., 1.0, 1e-5, 1e-10, and many numbers close to 0).
+        When summing these numbers using parallel reduction, different summation orders (due to parallelism)
+        can produce different accumulated rounding errors, leading to a subtle difference between
+        batch (parallel) and single-sample (serial) computation results.
+        """
+        # 1. Pick the desired magnitudes of `exp(a - max)`; the offsets below are their (precomputed) natural logs.
+        max_val = 20.0
+
+        # Offsets relative to max_val. These offsets result in values spanning vastly different orders of magnitude after exp.
+        trap_values = [
+            max_val,  # Corresponds to exp(a-max) -> 1.0
+            max_val - 4.6,  # Corresponds to exp(a-max) -> ~1e-2
+            max_val - 11.5,  # Corresponds to exp(a-max) -> ~1e-5
+            max_val - 23.0,  # Corresponds to exp(a-max) -> ~1e-10
+        ]
+
+        # 2. Create a background tensor filled with a very large negative number.
+        background_val = -1000.0
+        a = paddle.full((B, D), background_val, dtype=dtype)
+
+        # 3. Scatter these "trap" values at random positions in each row.
+        for i in range(B):
+            # Randomly shuffle the positions of the trap values for each row to increase non-determinism.
+            indices = random.sample(range(D), k=len(trap_values))
+            for j, val in enumerate(trap_values):
+                a[i, indices[j]] = val
+
+        return a
+
+    def test_batch_invariance(self, B: int = 2048, D: int = 4096, dtype=paddle.float32):
+        a = self.create_softmax_trap_tensor(B, D, dtype)
+
+        # Method 1: log_softmax on a single row (batch size 1)
+        out1 = paddle.nn.functional.log_softmax(a[:1])
+
+        # Method 2: log_softmax on the full batch, then slice out the first row
+        out2 = paddle.nn.functional.log_softmax(a)[:1]
+
+        # Check if results are identical
+        diff = (out1 - out2).abs().max()
+        return diff.item() == 0, diff
+
+    def run_iters(self, iters=10, ass=False):
+        for dtype in [paddle.float32, paddle.bfloat16, paddle.float16]:
+            is_deterministic = True
+            difflist = []
+            for i in range(iters):
+                isd, df = self.test_batch_invariance(dtype=dtype)
+                is_deterministic = is_deterministic and isd
+                difflist.append(df)
+            print(
+                f"Batch Deterministic: {is_deterministic} run-to-run max/min/diff {max(difflist)}/{min(difflist)}/{max(difflist)-min(difflist)} for {dtype} in {iters} iterations"
+            )
+            if ass:
+                assert max(difflist) == 0
+
+    def test_case(self):
+        # Test with standard Paddle (likely to show differences)
+        print("Standard Paddle:")
+        with set_batch_invariant_mode(False):
+            self.run_iters(ass=False)
+        # Test with batch-invariant operations
+        print("\nBatch-Invariant Mode:")
+        with set_batch_invariant_mode(True):
+            self.run_iters(ass=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
+    """
+    Even in Standard Paddle, we can achieve deterministic results, so maybe the standard implementation is already batch-invariant?
+
+    Result:
+
+    Standard Paddle:
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.float32 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.bfloat16 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.float16 in 10 iterations
+
+    Batch-Invariant Mode:
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.float32 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.bfloat16 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.float16 in 10 iterations
+    """
diff --git a/tests/batch_invariant/test_batch_invariance_op_mm.py b/tests/batch_invariant/test_batch_invariance_op_mm.py
@@ -56,3 +56,13 @@ def test_case(self):
 
 if __name__ == "__main__":
     unittest.main()
+    """
+
+    Standard Paddle:
+    Batch Deterministic: False run-to-run max/min/diff 10.7294921875/10.7294921875/0.0 for paddle.float32 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.bfloat16 in 10 iterations
+
+    Batch-Invariant Mode:
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.float32 in 10 iterations
+    Batch Deterministic: True run-to-run max/min/diff 0.0/0.0/0.0 for paddle.bfloat16 in 10 iterations
+    """