vllm-project
diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py
Lines changed: 319 additions & 0 deletions
@@ -36,6 +36,25 @@
 from ...utils import has_module_attribute, multi_gpu_test
 from ..backend import TestBackend
 
+# Helion imports
+try:
+    import torch.distributed._symmetric_memory as symm_mem
+    from vllm.compilation.helion.allreduce_add_rmsnorm import (
+        helion_allreduce_add_rmsnorm,
+    )
+
+    HELION_AVAILABLE = True
+except ImportError:
+    HELION_AVAILABLE = False
+
+# FlashInfer imports for baseline comparison
+try:
+    import flashinfer.comm as flashinfer_comm
+
+    FLASHINFER_AVAILABLE = True
+except ImportError:
+    FLASHINFER_AVAILABLE = False
+
 
 class TestAllReduceRMSNormModel(torch.nn.Module):
     def __init__(self, hidden_size=16, token_num=16, eps=1e-6):
@@ -192,6 +211,33 @@ def ops_in_model_before(self):
         ]
 
 
+class TestHelionAllReduceAddRMSNormModel(torch.nn.Module):
+    """
+    Test model whose forward pass is the Helion AllReduce + Add + RMSNorm op.
+
+    Args:
+        hidden_size: Size passed to ``RMSNorm``, whose weight is used as gamma.
+        token_num: Stored on the module; not used by ``forward``.
+        eps: Epsilon passed to ``RMSNorm`` and to the fused op.
+        splits_per_rank: Forwarded unchanged to the Helion op. Defaults to 4,
+            the value that was previously hard-coded in ``forward``.
+    """
+
+    def __init__(self, hidden_size=16, token_num=16, eps=1e-6, splits_per_rank=4):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.token_num = token_num
+        self.eps = eps
+        self.splits_per_rank = splits_per_rank
+        self.norm = RMSNorm(hidden_size, eps)
+        # Alias the norm weight so forward() can hand it straight to the op.
+        self.rms_gamma = self.norm.weight
+
+    def forward(self, input_shared, residual):
+        """
+        Forward pass using Helion fused op.
+
+        Args:
+            input_shared: Symmetric tensor to be all-reduced
+            residual: Residual tensor to add
+
+        Returns:
+            Tuple of (normalized_output, updated_residual)
+        """
+        return helion_allreduce_add_rmsnorm(
+            input_shared,
+            residual,
+            self.rms_gamma,
+            self.eps,
+            splits_per_rank=self.splits_per_rank,
+        )
+
+
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize(
     "test_model, enable_quant_fp8_custom_op",
@@ -330,3 +376,276 @@ def all_reduce_fusion_pass_on_test_model(
         backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
         backend.check_after_ops(model.ops_in_model_after())
         del all_reduce_fusion_pass
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("batch_size", [8, 16])
+@pytest.mark.parametrize("seq_len", [8, 16])
+@pytest.mark.parametrize("hidden_size", [64, 128])
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("splits_per_rank", [2, 4])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"], reason="Only test on CUDA")
+@pytest.mark.skipif(not HELION_AVAILABLE, reason="Helion not available")
+@pytest.mark.skipif(not FLASHINFER_AVAILABLE, reason="FlashInfer not available")
+def test_helion_allreduce_add_rmsnorm(
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    splits_per_rank: int,
+):
+    """
+    Run the Helion AllReduce + Add + RMSNorm worker on two processes.
+
+    Each spawned worker compares the Helion op against the FlashInfer
+    baseline numerically and then prints a timing comparison; the timings
+    are reported only, not asserted.
+
+    Args:
+        batch_size: Batch size for the test
+        seq_len: Sequence length for the test
+        hidden_size: Hidden dimension size
+        dtype: Data type (bfloat16 or float16)
+        splits_per_rank: Number of splits for progressive AllReduce
+    """
+    world_size = 2
+
+    # spawn() calls the worker as fn(process_index, *args); the worker
+    # receives that index as its local_rank.
+    worker_args = (
+        world_size,
+        batch_size,
+        seq_len,
+        hidden_size,
+        dtype,
+        splits_per_rank,
+    )
+    torch.multiprocessing.spawn(
+        helion_allreduce_add_rmsnorm_worker,
+        args=worker_args,
+        nprocs=world_size,
+    )
+
+
+def _flashinfer_allreduce_add_rmsnorm(
+    allreduce_in,
+    residual,
+    rms_gamma,
+    rms_eps,
+    workspace,
+    rank,
+    world_size,
+):
+    """Launch FlashInfer's fused AllReduce + residual add + RMSNorm in place.
+
+    ``allreduce_in`` is also passed as ``norm_out`` and ``residual`` is also
+    passed as ``residual_out``, so both tensors are overwritten with results.
+    ``token_num`` and ``hidden_dim`` are taken from ``allreduce_in.shape``,
+    which must therefore be 2-D.
+    """
+    token_num, hidden_dim = allreduce_in.shape
+    flashinfer_comm.trtllm_allreduce_fusion(
+        allreduce_in=allreduce_in,
+        token_num=token_num,
+        residual_in=residual,
+        residual_out=residual,
+        norm_out=allreduce_in,
+        rms_gamma=rms_gamma,
+        rms_eps=rms_eps,
+        hidden_dim=hidden_dim,
+        workspace_ptrs=workspace,
+        pattern_code=flashinfer_comm.AllReduceFusionPattern.kARResidualRMSNorm,
+        allreduce_out=None,
+        quant_out=None,
+        scale_out=None,
+        layout_code=flashinfer_comm.QuantizationSFLayout.SWIZZLED_128x4,
+        scale_factor=None,
+        use_oneshot=False,
+        world_rank=rank,
+        world_size=world_size,
+        launch_with_pdl=True,
+        trigger_completion_at_end=True,
+        fp32_acc=True,
+    )
+
+
+def _time_kernel(kernel_fn, warmup=5, num_iterations=20):
+    """Return the mean time in milliseconds of ``kernel_fn`` per call.
+
+    Runs ``warmup`` untimed calls, then times ``num_iterations`` calls
+    between two CUDA events recorded on the current stream.
+    """
+    for _ in range(warmup):
+        kernel_fn()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    start_event.record()
+    for _ in range(num_iterations):
+        kernel_fn()
+    end_event.record()
+
+    # elapsed_time() is only valid once both events have completed.
+    torch.cuda.synchronize()
+
+    return start_event.elapsed_time(end_event) / num_iterations
+
+
+def helion_allreduce_add_rmsnorm_worker(
+    local_rank: int,
+    world_size: int,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    dtype: torch.dtype,
+    splits_per_rank: int,
+):
+    """
+    Worker function for testing Helion AllReduce + Add + RMSNorm.
+
+    Initializes the distributed environment for this rank, checks the Helion
+    op against the FlashInfer fused kernel with ``torch.testing.assert_close``,
+    and then prints a timing comparison on rank 0. The FlashInfer IPC
+    workspace is released even when a check fails.
+
+    Args:
+        local_rank: Process index; also used as the CUDA device index.
+        world_size: Number of participating processes.
+        batch_size: Multiplied by ``seq_len`` to get the token count.
+        seq_len: Multiplied by ``batch_size`` to get the token count.
+        hidden_size: Size of the last tensor dimension.
+        dtype: Data type of all test tensors.
+        splits_per_rank: Forwarded to ``helion_allreduce_add_rmsnorm``.
+
+    Raises:
+        AssertionError: If Helion and FlashInfer outputs differ beyond the
+            dtype-specific tolerances.
+    """
+    import warnings
+
+    import torch.distributed as dist
+
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    # Initialize distributed environment
+    update_environment_variables(
+        {
+            "RANK": str(local_rank),
+            "LOCAL_RANK": str(local_rank),
+            "WORLD_SIZE": str(world_size),
+            "MASTER_ADDR": "localhost",
+            "MASTER_PORT": "12346",  # Different port from other tests
+        }
+    )
+
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    token_num = batch_size * seq_len
+    M, K = token_num, hidden_size
+    rms_eps = 1e-6
+
+    # ========== Setup FlashInfer baseline ==========
+    flashinfer_ipc_handles, flashinfer_workspace = (
+        flashinfer_comm.trtllm_create_ipc_workspace_for_all_reduce_fusion(
+            tp_rank=local_rank,
+            tp_size=world_size,
+            max_token_num=M,
+            hidden_dim=K,
+            group=dist.group.WORLD,
+            use_fp32_lamport=False,
+        )
+    )
+
+    # Everything that uses the workspace runs inside try/finally so a failed
+    # assertion cannot leak it.
+    try:
+        # ========== Test Numerical Correctness ==========
+        # The seed is offset by rank: each rank gets different data, and the
+        # data is reproducible from run to run.
+        torch.manual_seed(42 + local_rank)
+        input_data = torch.randn(M, K, dtype=dtype, device=device)
+        residual_data = torch.randn(M, K, dtype=dtype, device=device)
+        rms_gamma = torch.ones(K, dtype=dtype, device=device)
+
+        def run_baseline(buffer, residual):
+            _flashinfer_allreduce_add_rmsnorm(
+                buffer,
+                residual,
+                rms_gamma,
+                rms_eps,
+                flashinfer_workspace,
+                local_rank,
+                world_size,
+            )
+
+        def run_helion(buffer, residual):
+            return helion_allreduce_add_rmsnorm(
+                buffer, residual, rms_gamma, rms_eps, splits_per_rank
+            )
+
+        # Run FlashInfer baseline
+        input_baseline = symm_mem.empty(M, K, dtype=dtype, device=device)
+        input_baseline.copy_(input_data)
+        residual_baseline = residual_data.clone()
+
+        run_baseline(input_baseline, residual_baseline)
+        torch.cuda.synchronize()
+
+        # The baseline was given its inputs as outputs, so results live there.
+        norm_out_baseline = input_baseline
+        residual_out_baseline = residual_baseline
+
+        # Run Helion
+        input_helion = symm_mem.empty(M, K, dtype=dtype, device=device)
+        input_helion.copy_(input_data)
+        residual_helion = residual_data.clone()
+
+        norm_out_helion, residual_out_helion = run_helion(
+            input_helion, residual_helion
+        )
+        torch.cuda.synchronize()
+
+        # Compare results
+        # Use relaxed tolerances for bfloat16 and for accumulated errors
+        if dtype == torch.bfloat16:
+            rtol, atol = 1e-2, 1e-2
+        else:
+            rtol, atol = 1e-3, 1e-3
+
+        # Check normalized output
+        torch.testing.assert_close(
+            norm_out_helion,
+            norm_out_baseline,
+            rtol=rtol,
+            atol=atol,
+            msg=f"Normalized output mismatch (rank={local_rank}, dtype={dtype})",
+        )
+
+        # Check residual output
+        torch.testing.assert_close(
+            residual_out_helion,
+            residual_out_baseline,
+            rtol=rtol,
+            atol=atol,
+            msg=f"Residual output mismatch (rank={local_rank}, dtype={dtype})",
+        )
+
+        if local_rank == 0:
+            print(
+                f"✓ Numerical correctness test passed "
+                f"(M={M}, K={K}, dtype={dtype}, splits={splits_per_rank})"
+            )
+
+        # ========== Performance Comparison ==========
+        # Both timed callables restore their inputs first, so the reported
+        # times include two device copies per call for either implementation.
+        input_data_perf = torch.randn(M, K, dtype=dtype, device=device)
+        residual_data_perf = torch.randn(M, K, dtype=dtype, device=device)
+
+        # Benchmark FlashInfer
+        input_baseline_perf = symm_mem.empty(M, K, dtype=dtype, device=device)
+        residual_baseline_perf = torch.empty(M, K, dtype=dtype, device=device)
+
+        def baseline_fn():
+            input_baseline_perf.copy_(input_data_perf)
+            residual_baseline_perf.copy_(residual_data_perf)
+            run_baseline(input_baseline_perf, residual_baseline_perf)
+
+        dist.barrier()
+        baseline_time = _time_kernel(baseline_fn)
+        dist.barrier()
+
+        # Benchmark Helion
+        input_helion_perf = symm_mem.empty(M, K, dtype=dtype, device=device)
+        residual_helion_perf = torch.empty(M, K, dtype=dtype, device=device)
+
+        def helion_fn():
+            input_helion_perf.copy_(input_data_perf)
+            residual_helion_perf.copy_(residual_data_perf)
+            run_helion(input_helion_perf, residual_helion_perf)
+
+        dist.barrier()
+        helion_time = _time_kernel(helion_fn)
+        dist.barrier()
+
+        if local_rank == 0:
+            speedup = baseline_time / helion_time
+            print(f"✓ Performance comparison (M={M}, K={K}, dtype={dtype}):")
+            print(f"  FlashInfer: {baseline_time:.4f} ms")
+            print(f"  Helion:     {helion_time:.4f} ms")
+            print(f"  Speedup:    {speedup:.2f}x")
+    finally:
+        # Cleanup stays best-effort, but a failure is reported instead of
+        # being silently dropped, and KeyboardInterrupt/SystemExit propagate.
+        try:
+            flashinfer_comm.trtllm_destroy_ipc_workspace_for_all_reduce_fusion(
+                flashinfer_ipc_handles
+            )
+        except Exception as exc:
+            warnings.warn(
+                "Failed to destroy FlashInfer IPC workspace "
+                f"(rank={local_rank}): {exc!r}",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+