
Commit 0e4cb60

up
1 parent ec0806b commit 0e4cb60

File tree

7 files changed (+45, -15 lines)


helion/_compiler/compile_environment.py

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ def __init__(
             collections.Counter()
         )
         self.specialized_vars: set[sympy.Symbol] = set()
+        self.specialized_strides: set[tuple[str, int]] = set()
         self.loop_dependency_checker = LoopDependencyChecker()
         self._symint_cache: dict[object, torch.SymInt] = {}
         self.device_load_count = (
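
For reference, entries in the new specialized_strides set are (argument name, stride dimension) pairs. A minimal, hypothetical illustration of the bookkeeping (the parameter name "x" is made up for the example):

# Hypothetical illustration: after hl.specialize(x.stride(0)) on an input
# parameter named "x", the handler in constexpr.py (below) records the pair.
specialized_strides: set[tuple[str, int]] = set()
specialized_strides.add(("x", 0))  # (argument name, dim)
assert ("x", 0) in specialized_strides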

helion/_compiler/device_function.py

Lines changed: 16 additions & 3 deletions
@@ -14,6 +14,7 @@
 
 import sympy
 import torch
+from torch._dynamo.source import LocalSource
 from torch._inductor.codegen.triton import TritonPrinter
 from torch.fx.graph import _Namespace
 
@@ -602,11 +603,23 @@ def tensor_size(self, fake_value: torch.Tensor, dim: int) -> Argument:
         return self._tensor_property(TensorSizeArg, fake_value, dim, "size")
 
     def tensor_stride(self, fake_value: torch.Tensor, dim: int) -> Argument:
+        v = fake_value.stride(dim)
+        env = CompileEnvironment.current()
+        # Check if this stride was explicitly specialized
+        source = env.input_sources.get(fake_value)
         if (
-            isinstance(v := fake_value.stride(dim), int)
-            and CompileEnvironment.current().settings.static_shapes
+            isinstance(source, LocalSource)
+            and (source.local_name, dim) in env.specialized_strides
         ):
-            return StaticShape(v)
+            return StaticShape(int(v))
+        if isinstance(v, int):
+            if env.settings.static_shapes:
+                return StaticShape(v)
+        else:
+            # Check if all free symbols are specialized
+            syms = v._sympy_().free_symbols
+            if syms and syms <= env.specialized_vars:
+                return StaticShape(int(v))
         return self._tensor_property(TensorStrideArg, fake_value, dim, "stride")
 
     def sorted_args(self) -> list[Argument]:
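
The rewritten tensor_stride() now decides in three stages whether a stride can be baked into the generated code. A standalone sketch of that decision order, using simplified stand-in parameters rather than helion's actual signatures:

# Simplified sketch of the decision order in tensor_stride() above; `stride`
# is an int or a torch.SymInt, and the other parameters mirror the
# CompileEnvironment fields touched by this commit.
def stride_becomes_constant(stride, arg_name, dim, specialized_strides,
                            specialized_vars, static_shapes):
    # Stage 1: this exact stride was specialized via hl.specialize().
    if (arg_name, dim) in specialized_strides:
        return True
    # Stage 2: a concrete int stride is constant only under static_shapes.
    if isinstance(stride, int):
        return static_shapes
    # Stage 3: a symbolic stride is constant when every free symbol in its
    # expression has already been specialized.
    syms = stride._sympy_().free_symbols
    return bool(syms) and syms <= specialized_vars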

helion/_compiler/type_propagation.py

Lines changed: 1 addition & 1 deletion
@@ -645,7 +645,7 @@ def propagate_call(
         attr = self.attr()
         if attr in {"dim", "ndimension"} and not (args or kwargs):
             return TypeInfo.from_example(self.tensor.fake_value.ndim, origin)
-        if attr in {"shape", "size"} and not kwargs:
+        if attr in {"shape", "size", "stride"} and not kwargs:
             fn = getattr(self.tensor.fake_value, attr)
             try:
                 return TypeInfo.from_example(

helion/exc.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ class SpecializeOnDevice(BaseError):
 
 
 class SpecializeArgType(BaseError):
-    message = "hl.specialize() must be called on a size from an input tensor, got: {}"
+    message = "hl.specialize() must be called on a size or stride from an input tensor, got: {}"
 
 
 class StackTensorcOnHost(BaseError):

helion/language/constexpr.py

Lines changed: 15 additions & 1 deletion
@@ -6,6 +6,9 @@
 from typing_extensions import TypeVar
 
 import torch
+from torch._dynamo.source import LocalSource
+from torch._dynamo.source import TensorProperty
+from torch._dynamo.source import TensorPropertySource
 
 from .. import exc
 from .._compiler.ast_extension import expr_from_string
@@ -87,7 +90,18 @@ def _(value: TypeInfo, *, origin: Origin) -> TypeInfo:
     env = CompileEnvironment.current()
 
     def handle_symint(symint: torch.SymInt) -> int:
-        env.specialized_vars.update(symint._sympy_().free_symbols)
+        syms = symint._sympy_().free_symbols
+        env.specialized_vars.update(syms)
+        # Track stride specializations
+        for sym in syms:
+            for source in env.shape_env.var_to_sources.get(sym, []):
+                if (
+                    isinstance(source, TensorPropertySource)
+                    and source.prop == TensorProperty.STRIDE
+                    and isinstance(source.base, LocalSource)
+                    and source.idx is not None
+                ):
+                    env.specialized_strides.add((source.base.local_name, source.idx))
         return symint.__int__()
 
     specialized = _convert_specializable(proxy, on_symint=handle_symint)
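
Taken together, these changes let hl.specialize() accept strides as well as sizes. A hypothetical usage sketch (the kernel body is illustrative only; hl.specialize(x.stride(0)) is the newly supported call):

import torch
import helion
import helion.language as hl

@helion.kernel
def copy_kernel(x: torch.Tensor) -> torch.Tensor:
    # Newly supported: specialize on a stride, not just a size. The kernel
    # is recompiled per distinct stride value, and the stride is emitted as
    # a compile-time constant instead of a runtime argument.
    hl.specialize(x.stride(0))
    out = torch.empty_like(x)
    for tile in hl.tile(x.size()):
        out[tile] = x[tile]
    return out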

helion/runtime/kernel.py

Lines changed: 5 additions & 3 deletions
@@ -624,12 +624,14 @@ def _specialize_extra(self) -> list[Callable[[Sequence[object]], Hashable]]:
 
         def make_extractor(v: Source) -> Callable[[Sequence[object]], Hashable]:
             if isinstance(v, TensorPropertySource):
-                assert v.prop == TensorProperty.SIZE
                 index = v.idx
                 assert index is not None
                 inner = make_extractor(v.base)
-
-                return lambda args: cast("torch.Tensor", inner(args)).size(index)
+                if v.prop == TensorProperty.SIZE:
+                    return lambda args: cast("torch.Tensor", inner(args)).size(index)
+                if v.prop == TensorProperty.STRIDE:
+                    return lambda args: cast("torch.Tensor", inner(args)).stride(index)
+                raise exc.SpecializeArgType(v)
             if isinstance(v, LocalSource):
                 index = arg_name_to_index[v.local_name]
                 return operator.itemgetter(index)
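
The extractors built here feed the specialization cache key, so each distinct stride value maps to its own compiled kernel. A rough, hypothetical stand-in for the closure the new STRIDE branch returns:

import torch

# Hypothetical stand-in for the closure make_extractor() builds when
# v.prop == TensorProperty.STRIDE: fetch the positional arg, read its stride.
def stride_extractor(arg_index: int, dim: int):
    return lambda args: args[arg_index].stride(dim)

key_fn = stride_extractor(0, 0)      # e.g. for specializing x.stride(0)
print(key_fn((torch.empty(4, 8),)))  # prints 8 for a contiguous 4x8 tensor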

test/test_examples.expected

Lines changed: 6 additions & 6 deletions
@@ -1674,18 +1674,18 @@ def _helion_fused_linear_jsd_kernel(student_logits, teacher_logits, loss, temper
     # src[fused_linear_jsd.py:N]: teacher_div = torch.nn.functional.kl_div(
     # src[fused_linear_jsd.py:N]: torch.log(m), teacher_prob, reduction="none", log_target=True
     # src[fused_linear_jsd.py:N]: ).sum(dim=-1)
-    v_17 = teacher_prob_1 - v_16
-    v_18 = libdevice.exp(teacher_prob_1)
-    v_19 = v_18 * v_17
+    v_17 = libdevice.exp(teacher_prob_1)
+    v_18 = teacher_prob_1 - v_16
+    v_19 = v_17 * v_18
     teacher_div = tl.cast(tl.sum(v_19, 1), tl.float32)
     # src[fused_linear_jsd.py:N]: torch.log(m), student_prob, reduction="none", log_target=True
     v_20 = tl_math.log(v_15)
     # src[fused_linear_jsd.py:N]: student_div = torch.nn.functional.kl_div(
     # src[fused_linear_jsd.py:N]: torch.log(m), student_prob, reduction="none", log_target=True
     # src[fused_linear_jsd.py:N]: ).sum(dim=-1)
-    v_21 = student_prob_1 - v_20
-    v_22 = libdevice.exp(student_prob_1)
-    v_23 = v_22 * v_21
+    v_21 = libdevice.exp(student_prob_1)
+    v_22 = student_prob_1 - v_20
+    v_23 = v_21 * v_22
     student_div = tl.cast(tl.sum(v_23, 1), tl.float32)
     # src[fused_linear_jsd.py:N]: batch_loss = student_div + beta * (teacher_div - student_div)
     v_24 = teacher_div - student_div
