
Commit f1c0b89

Generalize examples with the DEVICE variable (#915)

1 parent: 3f6d43d

38 files changed: +185 -135 lines
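The change swaps hard-coded device="cuda" arguments throughout the examples for a shared DEVICE constant imported from helion._testing, so the same scripts can run on whichever accelerator the test harness selects. The commit does not show that helper's definition; below is a minimal sketch of what such a constant could look like, assuming a CUDA-then-XPU-then-CPU preference (the actual helion implementation may differ):

import torch

def _pick_device() -> torch.device:
    # Prefer CUDA, then Intel XPU, then fall back to CPU so the
    # examples still import and run everywhere.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.device("xpu")
    return torch.device("cpu")

DEVICE = _pick_device()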

examples/add.py

Lines changed: 3 additions & 2 deletions

@@ -15,6 +15,7 @@
 import torch

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -64,8 +65,8 @@ def check(m: int, n: int) -> None:
         m: First dimension of the test tensors
         n: Second dimension of the test tensors
     """
-    x = torch.randn([m, n], device="cuda", dtype=torch.float16)
-    y = torch.randn([m, n], device="cuda", dtype=torch.float16)
+    x = torch.randn([m, n], device=DEVICE, dtype=torch.float16)
+    y = torch.randn([m, n], device=DEVICE, dtype=torch.float16)
     run_example(add, torch.add, (x, y))

examples/all_gather_matmul.py

Lines changed: 4 additions & 1 deletion

@@ -20,6 +20,7 @@
 import torch.distributed._symmetric_memory as symm_mem

 import helion
+from helion._testing import DEVICE
 import helion.language as hl


@@ -201,7 +202,7 @@ def test(M: int, N: int, K: int, world_size: int, device: torch.device) -> None:
     a_shared = symm_mem.empty(
         M // world_size, K, dtype=torch.bfloat16, device=device
     ).normal_()
-    b = torch.randn((K, N), device="cuda", dtype=torch.bfloat16).T.contiguous().T
+    b = torch.randn((K, N), device=DEVICE, dtype=torch.bfloat16).T.contiguous().T
     a_out, c = helion_all_gather_matmul(a_shared, b)
     golden_a = a_shared.clone()
     dist_group = dist.group.WORLD
@@ -239,4 +240,6 @@ def main() -> None:
     --rdzv-backend c10d --rdzv-endpoint localhost:0 \
     --no_python python3 examples/all_gather_matmul.py
     """
+    # TODO(adam-smnk): generalize to XPU
+    assert DEVICE.type == "cuda", "Requires CUDA device"
     main()

examples/all_reduce.py

Lines changed: 3 additions & 0 deletions

@@ -22,6 +22,7 @@
 from torch.utils.cpp_extension import load_inline

 import helion
+from helion._testing import DEVICE
 import helion.language as hl

 # %%
@@ -273,4 +274,6 @@ def main() -> None:
     --rdzv-backend c10d --rdzv-endpoint localhost:0 \
     --no_python python3 examples/all_reduce.py
     """
+    # TODO(adam-smnk): generalize to XPU
+    assert DEVICE.type == "cuda", "Requires CUDA device"
     main()
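Both distributed examples gain the same two lines because torch's symmetric-memory collectives currently require CUDA. As a sketch, the guarded entry point ends up shaped like this (main() stands in for each example's own driver):

from helion._testing import DEVICE

if __name__ == "__main__":
    # Fail fast on non-CUDA backends rather than crashing inside the
    # symmetric-memory collectives; see the TODO about XPU support.
    assert DEVICE.type == "cuda", "Requires CUDA device"
    main()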

examples/attention.py

Lines changed: 2 additions & 1 deletion

@@ -21,6 +21,7 @@
 from torch.nn.attention.flex_attention import flex_attention

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -165,7 +166,7 @@ def main() -> None:
     Main entry point that runs the attention kernel test with specific parameters.
     Tests with batch size 2, 32 heads, 1024 sequence length, and 64-dimensional heads using float16.
     """
-    test(2, 32, 1024, 64, torch.float16)
+    test(2, 32, 1024, 64, torch.float16, device=DEVICE)


 if __name__ == "__main__":

examples/bf16xint16_gemm.py

Lines changed: 5 additions & 4 deletions

@@ -14,6 +14,7 @@
 from torch import Tensor

 import helion
+from helion._testing import DEVICE
 import helion.language as hl


@@ -137,17 +138,17 @@ def check(m: int, k: int, n: int) -> None:
         k (int): Shared dimension.
         n (int): Number of cols.
     """
-    x = torch.randn([m, k], device="cuda", dtype=torch.bfloat16)
-    w = torch.randint(-(2**15), 2**15 - 1, (k, n), device="cuda", dtype=torch.int16)
+    x = torch.randn([m, k], device=DEVICE, dtype=torch.bfloat16)
+    w = torch.randint(-(2**15), 2**15 - 1, (k, n), device=DEVICE, dtype=torch.int16)

     result = bf16xint16_gemm(x, w, transpose=False)
     expected = reference_bf16xint16_pytorch(x, w, transpose=False)
     torch.testing.assert_close(result, expected, rtol=1e-2, atol=1e-2)

     x_int16 = torch.randint(
-        -(2**15), 2**15 - 1, (m, k), device="cuda", dtype=torch.int16
+        -(2**15), 2**15 - 1, (m, k), device=DEVICE, dtype=torch.int16
     )
-    w_bf16 = torch.randn([k, n], device="cuda", dtype=torch.bfloat16)
+    w_bf16 = torch.randn([k, n], device=DEVICE, dtype=torch.bfloat16)

     result = bf16xint16_gemm(x_int16, w_bf16, transpose=True)
     expected = reference_bf16xint16_pytorch(x_int16, w_bf16, transpose=True)

examples/bmm.py

Lines changed: 3 additions & 2 deletions

@@ -16,6 +16,7 @@
 import torch

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -70,8 +71,8 @@ def check(b: int, m: int, k: int, n: int) -> None:
         k: Second dimension of the first matrix / First dimension of the second matrix
         n: Second dimension of the second matrix
     """
-    x = torch.randn([b, m, k], device="cuda", dtype=torch.float16)
-    y = torch.randn([b, k, n], device="cuda", dtype=torch.float16)
+    x = torch.randn([b, m, k], device=DEVICE, dtype=torch.float16)
+    y = torch.randn([b, k, n], device=DEVICE, dtype=torch.float16)
     run_example(bmm, torch.bmm, (x, y))
examples/concatenate.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import torch
1616

1717
import helion
18+
from helion._testing import DEVICE
1819
from helion._testing import run_example
1920
import helion.language as hl
2021

@@ -67,8 +68,8 @@ def main() -> None:
6768
Main entry point that runs the concatenation kernel verification.
6869
Tests with two tensors of shapes [1500, 400] and [1500, 600].
6970
"""
70-
x = torch.randn([1500, 400], device="cuda")
71-
y = torch.randn([1500, 600], device="cuda")
71+
x = torch.randn([1500, 400], device=DEVICE)
72+
y = torch.randn([1500, 600], device=DEVICE)
7273
run_example(concat2d_dim1, lambda x, y: torch.cat([x, y], dim=1), (x, y))
7374

7475

examples/cross_entropy.py

Lines changed: 3 additions & 2 deletions

@@ -15,6 +15,7 @@
 import torch

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -89,8 +90,8 @@ def main() -> None:
     """
     batch_size, seq_len, vocab_size = 8, 2048, 131072
     n = batch_size * seq_len
-    logits = torch.randn(n, vocab_size, device="cuda", dtype=torch.float32)
-    labels = torch.randint(0, vocab_size, (n,), device="cuda", dtype=torch.long)
+    logits = torch.randn(n, vocab_size, device=DEVICE, dtype=torch.float32)
+    labels = torch.randint(0, vocab_size, (n,), device=DEVICE, dtype=torch.long)

     run_example(
         cross_entropy,

examples/embedding.py

Lines changed: 3 additions & 2 deletions

@@ -17,6 +17,7 @@
 import torch

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -88,8 +89,8 @@ def main() -> None:
     Tests with a batch of indices and an embedding table of size 16x64.
     """
     num_embeddings, embedding_dim = 16, 64
-    x = torch.randint(0, num_embeddings, [256, 32], device="cuda", dtype=torch.int32)
-    weight = torch.randn([num_embeddings, embedding_dim], device="cuda")
+    x = torch.randint(0, num_embeddings, [256, 32], device=DEVICE, dtype=torch.int32)
+    weight = torch.randn([num_embeddings, embedding_dim], device=DEVICE)
     run_example(
         embedding, torch.nn.functional.embedding, (x, weight), atol=0.0, rtol=0.0
     )

examples/exp.py

Lines changed: 2 additions & 1 deletion

@@ -17,6 +17,7 @@
 import torch

 import helion
+from helion._testing import DEVICE
 from helion._testing import run_example
 import helion.language as hl

@@ -134,7 +135,7 @@ def check(n: int) -> None:
     Args:
         n: Size of the test tensor
     """
-    x = torch.randn(n, device="cuda", dtype=torch.float32, requires_grad=True)
+    x = torch.randn(n, device=DEVICE, dtype=torch.float32, requires_grad=True)
     run_example(exp, torch.exp, (x,), bwd=True)
