Skip to content

Commit 6581aac

Browse files

Update input shapes for example kernels (#845)

1 parent: f27abd1 · commit: 6581aac

File tree

13 files changed

+26
-43
lines changed

13 files changed

+26
-43
lines changed

examples/add.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ def main() -> None:
6969
"""
7070
Main entry point that runs the add kernel verification with 1024x1024 tensors.
7171
"""
72-
check(1024, 1024)
72+
check(10240, 10240)
7373

7474

7575
if __name__ == "__main__":

examples/cross_entropy.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,11 @@ def cross_entropy(
7979
def main() -> None:
8080
"""
8181
Main entry point that runs the cross entropy kernel verification.
82-
Tests with a batch size of 128 and vocabulary size of 1000.
8382
"""
84-
# Test with moderate size
85-
n, v = 128, 1000
86-
logits = torch.randn(n, v, device="cuda", dtype=torch.float32)
87-
labels = torch.randint(0, v, (n,), device="cuda", dtype=torch.long)
83+
batch_size, seq_len, vocab_size = 8, 2048, 131072
84+
n = batch_size * seq_len
85+
logits = torch.randn(n, vocab_size, device="cuda", dtype=torch.float32)
86+
labels = torch.randint(0, vocab_size, (n,), device="cuda", dtype=torch.long)
8887

8988
run_example(
9089
cross_entropy,

examples/exp.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,9 @@ def check(n: int) -> None:
132132
# -----------
133133
def main() -> None:
134134
"""
135-
Main entry point that runs the exp kernel verification with a tensor of size 1M elements.
135+
Main entry point that runs the exp kernel verification.
136136
"""
137-
check(1024 * 1024)
137+
check(10240 * 10240)
138138

139139

140140
if __name__ == "__main__":

examples/geglu.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def main() -> None:
280280
print("Testing GEGLU kernel...")
281281

282282
# Test GEGLU kernel with different shapes
283-
kernel_test_shapes = [(8, 128, 1024), (4, 1024, 2048)]
283+
kernel_test_shapes = [(8, 2048, 4096), (8, 4096, 8192)]
284284

285285
for shape in kernel_test_shapes:
286286
print(f"Testing GEGLU kernel shape: {shape}")
@@ -291,8 +291,8 @@ def main() -> None:
291291

292292
# Test GEGLU MLP with transformer-typical sizes
293293
mlp_test_configs = [
294-
(2, 128, 512, 2048), # Small transformer
295-
(8, 1024, 4096, 11008), # LLaMA-style config
294+
(8, 2048, 4096, 11008),
295+
(8, 4096, 8192, 11008),
296296
]
297297

298298
for batch_size, seq_len, hidden_size, intermediate_size in mlp_test_configs:

examples/int4_gemm.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,8 @@ def main() -> None:
163163
"""
164164
Main function to run tests with different matrix sizes.
165165
"""
166-
check(256, 512, 256)
167-
check(512, 512, 512)
168-
check(1024, 1024, 1024)
166+
check(4, 8192, 7168)
167+
check(8192, 8192, 8192)
169168

170169

171170
# %%

examples/jsd.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ def main() -> None:
326326
ignore_index = -100
327327
use_labels = False
328328

329-
for V in [2**i for i in range(12, 18)]:
329+
for V in [2**i for i in range(16, 18)]:
330330
print(
331331
f"Testing JSD: B={B}, T={T}, V={V}, beta={beta}, ignore_index={ignore_index}, labels={use_labels}"
332332
)

examples/kl_div.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,8 @@ def main() -> None:
244244
log_target = False
245245
eps = 1e-10
246246

247-
# Test with vocabulary sizes from tritonbench (2^12 to 2^17)
248-
for V in [2**i for i in range(12, 18)]:
247+
# Test with vocabulary sizes from tritonbench (2^16 to 2^17)
248+
for V in [2**i for i in range(16, 18)]:
249249
print(
250250
f"Testing KL Div: B={B}, T={T}, V={V}, reduction={reduction}, log_target={log_target}"
251251
)

examples/layer_norm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,8 +278,8 @@ def main() -> None:
278278
built-in layer_norm function using the run_example utility.
279279
- Prints comparison results and checks for correctness within specified tolerances.
280280
"""
281-
batch_size = 32
282-
dim = 64
281+
batch_size = 4096
282+
dim = 10240
283283
device = "cuda"
284284

285285
# Test forward pass only

examples/rms_norm.py

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -240,17 +240,9 @@ def check(m: int, n: int) -> None:
240240
def main() -> None:
241241
"""
242242
Main entry point that runs the RMS norm kernel verification with different tensor sizes.
243-
244-
Tests with configurations:
245-
- 32x64
246-
- 128x256
247-
- 1024x1024
248-
- 2048x1024
249243
"""
250-
check(32, 64)
251-
check(128, 256)
252-
check(1024, 1024)
253-
check(2048, 1024)
244+
check(2048, 4096)
245+
check(2048, 8192)
254246

255247

256248
if __name__ == "__main__":

examples/softmax.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def main() -> None:
111111
"""
112112
Main function to run the softmax kernel correctness check with example input size.
113113
"""
114-
check(1024, 1024)
114+
check(4096, 2560)
115115

116116

117117
# %%

0 commit comments

Comments (0)