Autotuning Progress Bar (#739)

msaroufim · web-flow · commit 8338452ddd1d · 2025-10-01T13:55:02.000-07:00
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -29,6 +29,7 @@
 import torch.multiprocessing as mp
 from torch.utils._pytree import tree_flatten
 from torch.utils._pytree import tree_map
+from tqdm.auto import tqdm
 from triton.testing import do_bench
 
 from .. import exc
@@ -295,13 +296,14 @@ def extract_launcher(
         )
 
     def parallel_benchmark(
-        self, configs: list[Config]
+        self, configs: list[Config], *, desc: str = "Benchmarking"
     ) -> list[tuple[Config, Callable[..., object], float]]:
         """
         Benchmark multiple configurations in parallel.
 
         Args:
             configs: A list of configurations to benchmark.
+            desc: Description for the progress bar.
 
         Returns:
             A list of tuples containing configurations and their performance.
@@ -319,7 +321,16 @@ def parallel_benchmark(
         else:
             is_workings = [True] * len(configs)
         results = []
-        for config, fn, is_working in zip(configs, fns, is_workings, strict=True):
+        iterator = zip(configs, fns, is_workings, strict=True)
+        if self.settings.autotune_progress_bar:
+            iterator = tqdm(
+                iterator,
+                total=len(configs),
+                desc=desc,
+                unit="config",
+                disable=not self.settings.autotune_progress_bar,
+            )
+        for config, fn, is_working in iterator:
             if is_working:
                 # benchmark one-by-one to avoid noisy results
                 results.append((config, fn, self.benchmark_function(config, fn)))
@@ -479,13 +490,19 @@ def make_unbenchmarked(self, flat_values: FlatConfig) -> PopulationMember:
         return PopulationMember(_unset_fn, [], flat_values, config)
 
     def parallel_benchmark_population(
-        self, members: list[PopulationMember]
+        self, members: list[PopulationMember], *, desc: str = "Benchmarking"
     ) -> list[PopulationMember]:
         """
         Benchmark multiple population members in parallel.  Members should be created with make_unbenchmarked.
+
+        Args:
+            members: The list of population members to benchmark.
+            desc: Description for the progress bar.
         """
         for member, (config_out, fn, perf) in zip(
-            members, self.parallel_benchmark([m.config for m in members]), strict=True
+            members,
+            self.parallel_benchmark([m.config for m in members], desc=desc),
+            strict=True,
         ):
             assert config_out is member.config
             member.perfs.append(perf)
@@ -523,30 +540,45 @@ def should_rebenchmark(self, member: PopulationMember) -> bool:
             and math.isfinite(member.perf)
         )
 
-    def rebenchmark(self, members: list[PopulationMember]) -> None:
+    def rebenchmark(
+        self, members: list[PopulationMember], *, desc: str = "Rebenchmarking"
+    ) -> None:
         """
         Re-benchmark a list of population members to avoid outliers.
+
+        Args:
+            members: The list of population members to rebenchmark.
+            desc: Description for the progress bar.
         """
         if len(members) < 2:
             return
         repeat = max(3, int(200 / self.best_perf_so_far))
-        new_timings = interleaved_bench(
-            [functools.partial(m.fn, *self.args) for m in members], repeat=repeat
-        )
+        iterator = [functools.partial(m.fn, *self.args) for m in members]
+        if self.settings.autotune_progress_bar:
+            new_timings = interleaved_bench(iterator, repeat=repeat, desc=desc)
+        else:
+            new_timings = interleaved_bench(iterator, repeat=repeat)
         for m, t in zip(members, new_timings, strict=True):
             m.perfs.append(t)
             if t < self.best_perf_so_far:
                 self.best_perf_so_far = t
 
     def rebenchmark_population(
-        self, members: list[PopulationMember] | None = None
+        self,
+        members: list[PopulationMember] | None = None,
+        *,
+        desc: str = "Rebenchmarking",
     ) -> None:
         """
         Re-benchmark the entire population to avoid outliers.
+
+        Args:
+            members: The list of population members to rebenchmark.
+            desc: Description for the progress bar.
         """
         if members is None:
             members = self.population
-        self.rebenchmark([p for p in members if self.should_rebenchmark(p)])
+        self.rebenchmark([p for p in members if self.should_rebenchmark(p)], desc=desc)
 
     def statistics(self) -> str:
         """
diff --git a/helion/autotuner/benchmarking.py b/helion/autotuner/benchmarking.py
@@ -4,14 +4,22 @@
 import statistics
 from typing import Callable
 
+from tqdm.auto import tqdm
 from triton import runtime
 
 
-def interleaved_bench(fns: list[Callable[[], object]], *, repeat: int) -> list[float]:
+def interleaved_bench(
+    fns: list[Callable[[], object]], *, repeat: int, desc: str | None = None
+) -> list[float]:
     """
     Benchmark multiple functions at once, interleaving their executions to reduce
     the impact of external factors (e.g., load, temperature) on the
     measurements.
+
+    Args:
+        fns: List of functions to benchmark
+        repeat: Number of times to repeat each benchmark
+        desc: Optional description for progress bar
     """
     # warmup
     for fn in fns:
@@ -30,7 +38,10 @@ def interleaved_bench(fns: list[Callable[[], object]], *, repeat: int) -> list[f
     ]
 
     di.synchronize()
-    for i in range(repeat):
+    iterator = range(repeat)
+    if desc is not None:
+        iterator = tqdm(iterator, desc=desc, total=repeat, unit="round")
+    for i in iterator:
         for j in range(len(fns)):
             clear_cache()
             start_events[j][i].record()
diff --git a/helion/autotuner/finite_search.py b/helion/autotuner/finite_search.py
@@ -35,7 +35,9 @@ def __init__(
     def _autotune(self) -> Config:
         best_config = None
         best_time = float("inf")
-        for config, _fn, time in self.parallel_benchmark(self.configs):
+        for config, _fn, time in self.parallel_benchmark(
+            self.configs, desc="Benchmarking"
+        ):
             if time < best_time:
                 best_time = time
                 best_config = config
diff --git a/helion/autotuner/pattern_search.py b/helion/autotuner/pattern_search.py
@@ -57,9 +57,9 @@ def _autotune(self) -> Config:
             if member.config not in visited:
                 visited.add(member.config)
                 self.population.append(member)
-        self.parallel_benchmark_population(self.population)
+        self.parallel_benchmark_population(self.population, desc="Initial population")
         # again with higher accuracy
-        self.rebenchmark_population(self.population)
+        self.rebenchmark_population(self.population, desc="Initial rebench")
         self.population.sort(key=performance)
         starting_points = []
         for member in self.population[: self.copies]:
@@ -90,11 +90,15 @@ def _autotune(self) -> Config:
                 break
             self.population = [*new_population.values()]
             # compile any unbenchmarked members in parallel
-            self.parallel_benchmark_population(
-                [m for m in self.population if len(m.perfs) == 0]
-            )
+            unbenchmarked = [m for m in self.population if len(m.perfs) == 0]
+            if unbenchmarked:
+                self.parallel_benchmark_population(
+                    unbenchmarked, desc=f"Gen {generation} neighbors"
+                )
             # higher-accuracy rebenchmark
-            self.rebenchmark_population(self.population)
+            self.rebenchmark_population(
+                self.population, desc=f"Gen {generation} rebench"
+            )
             self.log(
                 f"Generation {generation}, {num_neighbors} neighbors, {num_active} active:",
                 self.statistics,
diff --git a/helion/runtime/settings.py b/helion/runtime/settings.py
@@ -111,6 +111,9 @@ class _Settings:
     autotune_rebenchmark_threshold: float = float(
         os.environ.get("HELION_REBENCHMARK_THRESHOLD", "1.5")
     )
+    autotune_progress_bar: bool = (
+        os.environ.get("HELION_AUTOTUNE_PROGRESS_BAR", "1") == "1"
+    )
     print_output_code: bool = os.environ.get("HELION_PRINT_OUTPUT_CODE", "0") == "1"
     force_autotune: bool = os.environ.get("HELION_FORCE_AUTOTUNE", "0") == "1"
     allow_warp_specialize: bool = (
@@ -142,6 +145,7 @@ class Settings(_Settings):
         "autotune_random_seed": "Seed used for autotuner random number generation. Defaults to HELION_AUTOTUNE_RANDOM_SEED or a time-based seed.",
         "autotune_accuracy_check": "If True, validate candidate configs against the baseline kernel output before accepting them during autotuning.",
         "autotune_rebenchmark_threshold": "If a config is within threshold*best_perf, re-benchmark it to avoid outliers. Default is 1.5x.  Set to <1 to disable.",
+        "autotune_progress_bar": "If True, show progress bar during autotuning. Default is True. Set HELION_AUTOTUNE_PROGRESS_BAR=0 to disable.",
         "print_output_code": "If True, print the output code of the kernel to stderr.",
         "force_autotune": "If True, force autotuning even if a config is provided.",
         "allow_warp_specialize": "If True, allow warp specialization for tl.range calls on CUDA devices.",
diff --git a/pyproject.toml b/pyproject.toml
@@ -20,7 +20,8 @@ dependencies = [
     "torch>=2.7.0",
     "typing_extensions>=4.0.0",
     "filecheck",
-    "psutil"
+    "psutil",
+    "tqdm"
 ]
 
 [project.optional-dependencies]
diff --git a/requirements.txt b/requirements.txt
@@ -4,3 +4,4 @@ pre-commit
 filecheck
 expecttest
 numpy
+tqdm

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -20,7 +20,8 @@ dependencies = [` |
| `20` | `20` | `"torch>=2.7.0",` |
| `21` | `21` | `"typing_extensions>=4.0.0",` |
| `22` | `22` | `"filecheck",` |
| `23` |  | `- "psutil"` |
|  | `23` | `+ "psutil",` |
|  | `24` | `+ "tqdm"` |
| `24` | `25` | `]` |
| `25` | `26` |  |
| `26` | `27` | `[project.optional-dependencies]` |