Skip to content

Commit 499bb17

Browse files
committed
Add support for hl.grid(begin, end, step)
stack-info: PR: #211, branch: jansel/stack/62
1 parent 61c7c18 commit 499bb17

File tree

5 files changed

+448
-56
lines changed

5 files changed

+448
-56
lines changed

helion/_compiler/program_id.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def combined_device_cdiv(self, state: CodegenState) -> str:
119119
return " * ".join(pid.device_cdiv(state) for pid in self.pids)
120120

121121
def combined_host_cdiv(self) -> str:
122-
return " * ".join(pid.host_cdiv() for pid in self.pids)
122+
return " * ".join(f"({pid.host_cdiv()})" for pid in self.pids)
123123

124124
def codegen(self, state: CodegenState) -> None:
125125
pid_var = self.shared_pid_var or "tl.program_id(0)"

helion/_compiler/tile_strategy.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -440,13 +440,32 @@ def codegen_grid(self, state: CodegenState) -> DeviceGridState:
440440
pids = self.select_pid_strategy()
441441
if isinstance(state.device_function.pid, SharedProgramID):
442442
pids.shared_pid_var = state.device_function.pid.shared_pid_var
443-
for i, (block_idx, block_size) in enumerate(
444-
reversed(self._reorder([*zip(block_ids, block_sizes, strict=True)]))
443+
444+
assert state.ast_args is None
445+
assert len(state.proxy_args) == 3
446+
if state.proxy_args[1] is None:
447+
begins = [0] * len(block_ids)
448+
else:
449+
begins = state.proxy_args[0]
450+
if not isinstance(begins, (list, tuple)):
451+
begins = [begins]
452+
assert len(begins) == len(block_ids)
453+
454+
for i, (block_idx, block_size, begin) in enumerate(
455+
reversed(self._reorder([*zip(block_ids, block_sizes, begins, strict=True)]))
445456
):
446457
numel = env.block_sizes[block_idx].numel
447458
offset_var = self.offset_var(block_idx)
448459
index_var = self.index_var(block_idx)
449460
pid_var = device_function.new_var(f"pid_{i}", dce=True)
461+
462+
begin_offset_expr = ""
463+
if begin != 0:
464+
begin_ast = self._to_ast(begin, to_dtype=dtype)
465+
begin_offset_expr = (
466+
f"{state.codegen.lift(begin_ast, dce=True, prefix='begin').id} + "
467+
)
468+
450469
if block_size != 1:
451470
block_size_var = self.block_size_var(block_idx)
452471
assert block_size_var is not None
@@ -457,14 +476,16 @@ def codegen_grid(self, state: CodegenState) -> DeviceGridState:
457476
f"{block_size_var} = {HostFunction.current().literal_expr(block_size)}"
458477
)
459478
)
460-
state.add_statement(f"{offset_var} = {pid_var} * {block_size_var}")
479+
state.add_statement(
480+
f"{offset_var} = {begin_offset_expr}{pid_var} * {block_size_var}"
481+
)
461482
state.add_statement(
462483
f"{index_var} = ({offset_var} + tl.arange(0, ({block_size_var}))).to({dtype})"
463484
)
464485
else:
465486
block_size_var = "1"
466487
dtype = env.triton_index_type()
467-
state.add_statement(f"{offset_var} = {pid_var}")
488+
state.add_statement(f"{offset_var} = {begin_offset_expr}{pid_var}")
468489
state.add_statement(
469490
f"{index_var} = {offset_var} + tl.zeros([1], {dtype})"
470491
)
@@ -509,6 +530,8 @@ def _to_ast(self, x: object, to_dtype: str | None = None) -> ast.AST:
509530
from .device_function import DeviceFunction
510531

511532
return expr_from_string(DeviceFunction.current().sympy_expr(x))
533+
if isinstance(x, torch.SymInt):
534+
return self._to_ast(x._sympy_())
512535
raise NotImplementedError(f"{type(x)} is not implemented.")
513536

514537
def codegen_device_loop(self, state: CodegenState) -> DeviceLoopState:

helion/_compiler/type_propagation.py

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -982,29 +982,24 @@ def proxy(self) -> object:
982982

983983
@staticmethod
984984
def allocate(
985-
numel: int | torch.SymInt | AutoSize | None, origin: Origin
986-
) -> TileIndexType:
987-
env = CompileEnvironment.current()
988-
block_id = env.allocate_block_size(numel, source=LoopSpecBlockSizeSource())
989-
env.config_spec.block_sizes.append(
990-
BlockSizeSpec(
991-
block_id=block_id,
992-
size_hint=_get_hint(numel),
993-
)
994-
)
995-
return TileIndexType(origin, block_id)
996-
997-
@staticmethod
998-
def allocate_fixed(
999985
numel: int | torch.SymInt | AutoSize | None,
1000-
block_size: int | torch.SymInt,
1001986
origin: Origin,
987+
block_size: int | torch.SymInt | None = None,
1002988
) -> TileIndexType:
1003989
env = CompileEnvironment.current()
1004-
return TileIndexType(
1005-
origin,
1006-
env.allocate_block_size(numel, source=FixedBlockSizeSource(block_size)),
1007-
)
990+
if block_size is None:
991+
block_id = env.allocate_block_size(numel, source=LoopSpecBlockSizeSource())
992+
env.config_spec.block_sizes.append(
993+
BlockSizeSpec(
994+
block_id=block_id,
995+
size_hint=_get_hint(numel),
996+
)
997+
)
998+
else:
999+
block_id = env.allocate_block_size(
1000+
numel, source=FixedBlockSizeSource(block_size)
1001+
)
1002+
return TileIndexType(origin, block_id)
10081003

10091004
def merge(self, other: TypeInfo) -> TypeInfo:
10101005
if isinstance(other, TileIndexType):
@@ -1024,21 +1019,30 @@ def propagate_attribute(self, attr: str, origin: AttributeOrigin) -> TypeInfo:
10241019
class GridIndexType(SymIntType):
10251020
block_id: int
10261021

1027-
def __init__(self, origin: Origin, sym: torch.SymInt, block_id: int) -> None:
1022+
def __init__(
1023+
self,
1024+
origin: Origin,
1025+
sym: torch.SymInt,
1026+
block_id: int,
1027+
) -> None:
10281028
super().__init__(origin, sym)
10291029
self.block_id = block_id
10301030

10311031
def __str__(self) -> str: # pragma: no cover – debug helper
10321032
return f"{type(self).__name__}({self.block_id})"
10331033

10341034
@staticmethod
1035-
def allocate(numel: int | torch.SymInt, origin: Origin) -> GridIndexType:
1035+
def allocate(
1036+
numel: int | torch.SymInt,
1037+
origin: Origin,
1038+
step: int | torch.SymInt = 1,
1039+
) -> GridIndexType:
10361040
from .._compiler.compile_environment import CompileEnvironment
10371041
from .host_function import HostFunction
10381042
from .host_function import SymbolOrigin
10391043

10401044
env = CompileEnvironment.current()
1041-
block_id = env.allocate_block_size(numel, source=FixedBlockSizeSource(1))
1045+
block_id = env.allocate_block_size(numel, source=FixedBlockSizeSource(step))
10421046
# assign this a new unbacked symbol since this should be treated like a scalar rather than a tile
10431047
sym = env.create_unbacked_symint()
10441048
HostFunction.current().expr_to_origin[sym._sympy_()] = SymbolOrigin(

helion/language/loops.py

Lines changed: 70 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -200,13 +200,13 @@ def _(
200200
if bs is None:
201201
results.append(TileIndexType.allocate(size, origin))
202202
elif isinstance(bs, int):
203-
results.append(TileIndexType.allocate_fixed(size, bs, origin))
203+
results.append(TileIndexType.allocate(size, origin, bs))
204204
elif isinstance(bs, torch.SymInt):
205205
from helion._compiler.compile_environment import CompileEnvironment
206206

207207
index = CompileEnvironment.current().get_block_id(bs)
208208
if index is None:
209-
results.append(TileIndexType.allocate_fixed(size, bs, origin))
209+
results.append(TileIndexType.allocate(size, origin, bs))
210210
else:
211211
results.append(TileIndexType(origin=origin, block_id=index))
212212
CompileEnvironment.current().block_sizes[index].mark_alternate_size(
@@ -289,63 +289,104 @@ def _codegen_loop_helper(
289289
@_decorators.api(
290290
is_device_loop=True, is_device_only=False, cache_type=True, tiles_as_sizes=True
291291
)
292-
def grid(sizes: int, /) -> Iterator[torch.SymInt]: ...
292+
def grid(
293+
begin_or_end: int | torch.Tensor,
294+
end_or_none: int | torch.Tensor | None = None,
295+
/,
296+
step: object = None,
297+
) -> Iterator[torch.SymInt]: ...
293298

294299

295300
@overload
296301
@_decorators.api(
297302
is_device_loop=True, is_device_only=False, cache_type=True, tiles_as_sizes=True
298303
)
299-
def grid(sizes: Sequence[int], /) -> Iterator[Sequence[torch.SymInt]]: ...
304+
def grid(
305+
begin_or_end: Sequence[int | torch.Tensor],
306+
end_or_none: Sequence[int | torch.Tensor] | None = None,
307+
/,
308+
step: object = None,
309+
) -> Iterator[Sequence[torch.SymInt]]: ...
300310

301311

302312
@_decorators.api(
303313
is_device_loop=True, is_device_only=False, cache_type=True, tiles_as_sizes=True
304314
)
305315
def grid(
306-
sizes: int | Sequence[int],
316+
begin_or_end: int | torch.Tensor | Sequence[int | torch.Tensor],
317+
end_or_none: int | torch.Tensor | Sequence[int | torch.Tensor] | None = None,
307318
/,
319+
step: object = None,
308320
) -> Iterator[torch.SymInt] | Iterator[Sequence[torch.SymInt]]: # type: ignore[type-arg]
309-
"""Iterate over *individual* indices of the given iteration space.
321+
"""Iterate over individual indices of the given iteration space.
310322
311323
Semantics are equivalent to
312324
313-
for i in hl.tile(size, block_size=1):
325+
for i in hl.tile(...):
314326
...
315327
316328
but `i` will be a scalar (`torch.SymInt`), not a 1-element tensor.
317-
"""
318329
330+
When used at the top level of a function, this becomes the grid of the kernel.
331+
Otherwise, it becomes a loop in the output kernel.
332+
333+
Similar to `range()` there are multiple forms of this function:
334+
grid(end) iterates from 0 to `end - 1`, with step size 1.
335+
grid(begin, end) iterates from `begin` to `end - 1`, with step size 1.
336+
grid(begin, end, step) iterates from `begin` up to (but not including) `end`, advancing by the given step size.
337+
grid(end, step=step) iterates from 0 up to (but not including) `end`, advancing by the given step size.
338+
"""
319339
raise exc.NotInsideKernel
320340

321341

322342
@_decorators.type_propagation(grid)
323-
def _(sizes: TypeInfo, *, origin: Origin) -> TypeInfo:
343+
def _(
344+
begin_or_end: TypeInfo,
345+
end_or_none: TypeInfo | None = None,
346+
/,
347+
step: TypeInfo | None = None,
348+
*,
349+
origin: Origin,
350+
) -> TypeInfo:
324351
parent = ExtendedAST.current()[-2]
325352
if not isinstance(parent, ast.For):
326353
raise exc.LoopFunctionNotInFor("grid")
327-
try:
328-
proxy_sizes = sizes.proxy()
329-
if not (
330-
isinstance(proxy_sizes, (int, torch.SymInt))
331-
or (
332-
isinstance(proxy_sizes, (list, tuple))
333-
and all(isinstance(x, (int, torch.SymInt)) for x in proxy_sizes)
334-
)
335-
):
336-
raise NotImplementedError
337-
except NotImplementedError:
338-
raise exc.TypeInferenceError(
339-
f"grid() expected int or list[int], got {sizes!s}"
340-
) from None
354+
begin, end = _normalize_begin_end(begin_or_end, end_or_none, origin=origin)
355+
proxy_begin = _to_proxy(begin)
356+
proxy_end = _to_proxy(end)
357+
_check_matching(proxy_begin, proxy_end)
358+
if _not_none(step):
359+
proxy_step = Tile._tiles_to_sizes(_to_proxy(step))
360+
_check_matching(proxy_end, proxy_step)
361+
else:
362+
proxy_step = begin.tree_map(lambda n: None)
341363

342-
if isinstance(proxy_sizes, (int, torch.SymInt)):
343-
return IterType(origin, GridIndexType.allocate(proxy_sizes, origin))
364+
if unpack := not isinstance(proxy_end, (list, tuple)):
365+
proxy_begin = [proxy_begin]
366+
proxy_end = [proxy_end]
367+
proxy_step = [proxy_step]
368+
369+
results = []
370+
for begin_part, end_part, step_part in zip(
371+
proxy_begin, proxy_end, proxy_step, strict=True
372+
):
373+
size = end_part - begin_part
374+
if isinstance(size, torch.Tensor):
375+
size = None # data dependent size
376+
if step_part is None:
377+
step_part = 1
378+
results.append(GridIndexType.allocate(size, origin, step_part))
344379

345-
assert isinstance(proxy_sizes, (list, tuple))
346-
elements = [GridIndexType.allocate(s, origin) for s in proxy_sizes]
347-
_add_config_choices([x.block_id for x in elements])
348-
return IterType(origin, SequenceType(origin, elements))
380+
_add_config_choices(
381+
[x.block_id for x in results],
382+
is_tile=False,
383+
has_begin=not all((isinstance(x, int) and x == 0) for x in proxy_begin),
384+
)
385+
if unpack:
386+
(result,) = results
387+
else:
388+
result = SequenceType(origin, results)
389+
return IterType(origin, result)
349390

350391

351392
@_decorators.codegen(grid)

0 commit comments

Comments
 (0)