Note
Go to the end to download the full example code
AOT Standalone Compilation Example
Generates a single .py file from Helion kernels that has zero Helion
dependencies at runtime — only torch, triton, and
triton.language.
Run the full AOT workflow with --standalone to produce standalone files:
python -m helion.experimental.aot_runner --standalone \
-- python examples/aot_compile_example.py
This runs collect → measure → build → evaluate, then compiles standalone
Triton code and writes examples/aot_compile_example_add_2d_standalone.py
next to this file.
The standalone file is a drop-in replacement:
# before (needs helion installed)
from examples.aot_compile_example import add_2d
# after (pure triton — no helion)
from examples.aot_compile_example_add_2d_standalone import add_2d
from __future__ import annotations
import argparse
from functools import partial
import os
import torch
from helion._testing import DEVICE
from helion._testing import do_bench
import helion.experimental
import helion.language as hl
@helion.experimental.aot_kernel()
def add_2d(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Return the element-wise sum of two 2-D tensors as a new tensor.

    The result is allocated with ``torch.empty_like(x)`` and written tile by
    tile, using the tiles that ``hl.tile`` yields for the two dimensions
    of ``x``.
    """
    rows, cols = x.shape
    result = torch.empty_like(x)
    for row_tile, col_tile in hl.tile([rows, cols]):
        result[row_tile, col_tile] = x[row_tile, col_tile] + y[row_tile, col_tile]
    return result
# Kernel name -> name of the module-level function that benchmarks it.
# The values are function *names* (strings), not the functions themselves;
# ``main`` resolves them through ``globals()`` at run time.
KERNEL_BENCHMARKS: dict[str, str] = {"add_2d": "benchmark_add_2d"}
def benchmark_add_2d() -> None:
    """Time ``add_2d`` on several bfloat16 shapes and print a results table.

    For every shape the kernel is called once as a warmup and then timed with
    ``do_bench``. Each table row shows the shape, the measured time, and the
    throughput implied by reading both inputs and writing the output.
    """
    problem_sizes = [(64, 128), (256, 256), (1024, 1024), (4096, 4096)]

    print("=== add_2d kernel ===")
    print(f"{'Shape':>16} {'Time (ms)':>12} {'GB/s':>10}")
    print("-" * 40)

    for shape in problem_sizes:
        lhs = torch.randn(*shape, device=DEVICE, dtype=torch.bfloat16)
        rhs = torch.randn(*shape, device=DEVICE, dtype=torch.bfloat16)

        # Warmup call, so the timed runs below are not the first invocation.
        add_2d(lhs, rhs)
        elapsed_ms = do_bench(partial(add_2d, lhs, rhs))
        assert isinstance(elapsed_ms, float)

        # Traffic: two input tensors read plus one output tensor written.
        bytes_moved = 3 * lhs.numel() * lhs.element_size()
        # bytes per millisecond * 1e-6 == gigabytes (1e9 bytes) per second.
        throughput_gbps = bytes_moved / elapsed_ms * 1e-6
        print(f"{shape!s:>16} {elapsed_ms:>12.4f} {throughput_gbps:>10.2f}")
def main() -> None:
    """Parse the command line and run the selected kernel benchmarks.

    Kernel selection, in priority order:

    1. ``--kernel`` / ``-k`` options (may be repeated).
    2. The ``HELION_AOT_KERNELS`` environment variable, a comma-separated
       list of kernel names. Whitespace around each name is ignored and
       empty entries (e.g. from a trailing comma) are dropped.
    3. Every kernel listed in ``KERNEL_BENCHMARKS``.

    A name that is not a key of ``KERNEL_BENCHMARKS`` is reported on stdout
    and skipped; the remaining kernels still run. When ``HELION_AOT_MODE``
    is set to anything other than ``"disabled"``, the mode is printed first.
    """
    parser = argparse.ArgumentParser(description="AOT Compile Example")
    parser.add_argument(
        "--kernel",
        "-k",
        type=str,
        action="append",
        dest="kernels",
        help="Kernel(s) to benchmark (can be repeated). Default: all",
    )
    args = parser.parse_args()

    kernels = args.kernels
    if kernels is None:
        env_kernels = os.environ.get("HELION_AOT_KERNELS", "")
        # Normalise the list so "a, b" or "a,b," selects exactly a and b
        # instead of producing names like " b" or "" that can never match.
        # An unset or blank variable yields [], which falls through to
        # "all kernels" below.
        kernels = [
            name.strip() for name in env_kernels.split(",") if name.strip()
        ]

    aot_mode = os.environ.get("HELION_AOT_MODE", "disabled")
    if aot_mode != "disabled":
        print(f"AOT mode: {aot_mode}")
        print()

    targets = kernels or list(KERNEL_BENCHMARKS)
    for name in targets:
        if name in KERNEL_BENCHMARKS:
            # KERNEL_BENCHMARKS stores function names; look the function up
            # in this module's globals and call it.
            globals()[KERNEL_BENCHMARKS[name]]()
        else:
            print(f"Unknown kernel: {name}")


if __name__ == "__main__":
    main()
Total running time of the script: (0 minutes 0.000 seconds)