halide · antonysigma · Jul 10, 2025 · Jul 11, 2025 · Jul 11, 2025 · Jul 16, 2025
diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
@@ -14,21 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
+set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _bgu_autoscheduler_params
+        autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-# Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-# "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-#
-# CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-#
-# Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-#
-# Uncomment the following code to debug. PARAMS
-# autoscheduler.experimental_gpu_schedule=1
-)
+                   PARAMS ${_bgu_autoscheduler})
 
 # Main executable
 add_executable(bgu_filter filter.cpp)

diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
@@ -16,11 +16,29 @@ add_halide_generator(camera_pipe.generator
                      SOURCES camera_pipe_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _camera_pipe_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _camera_pipe_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS ${_camera_pipe_autoscheduler_params})
 
 # Main executable
 add_executable(camera_pipe_process process.cpp)

diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
+set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "opencl|metal|cuda|vulkan")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _harris_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_harris_autoscheduler_params})
 
 # Main executable
 add_executable(harris_filter filter.cpp)

diff --git a/apps/harris/harris_generator.cpp b/apps/harris/harris_generator.cpp
@@ -66,7 +66,7 @@ class Harris : public Halide::Generator<Harris> {
             const int kHeight = 2560;
             input.dim(0).set_estimate(0, kWidth);
             input.dim(1).set_estimate(0, kHeight);
-            input.dim(2).set_estimate(0, 3);
+            input.dim(2).set_estimate(0, 4);
             output.dim(0).set_estimate(3, kWidth - 6);
             output.dim(1).set_estimate(3, kHeight - 6);
         }

diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
@@ -32,18 +32,15 @@ target_link_libraries(iir_blur_filter PRIVATE
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "opencl")
-        # Error message:
-        #
-        # Error: OpenCL error: CL_INVALID_COMMAND_QUEUE clFinish failed
-        message(WARNING "Skipping Mullapudi2016's GPU auto-schedules for OpenCL target.")
-    else ()
-        configure_file(${IMAGE} rgb.png COPYONLY)
-        add_test(NAME iir_blur_filter
-                COMMAND iir_blur_filter rgb.png out.png)
-        set_tests_properties(iir_blur_filter PROPERTIES
-                            LABELS iir_blur
-                            PASS_REGULAR_EXPRESSION "Success!"
-                            SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb.png COPYONLY)
+    add_test(NAME iir_blur_filter
+            COMMAND iir_blur_filter rgb.png out.png)
+    set_tests_properties(iir_blur_filter PROPERTIES
+                        LABELS iir_blur
+                        PASS_REGULAR_EXPRESSION "Success!"
+                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
+                        # Pass in the keyword "metal" etc to skip the test
+                        # explicitly. Buildbot can print a nice test report
+                        # for all skipped tests.
+                        ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
 endif ()
diff --git a/apps/iir_blur/filter.cpp b/apps/iir_blur/filter.cpp
@@ -1,6 +1,7 @@
 #include <cassert>
 #include <cstdio>
 #include <cstdlib>
+#include <regex>
 
 #include "HalideBuffer.h"
 #include "HalideRuntime.h"
@@ -13,12 +14,54 @@
 
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    USING_METAL_OR_OPENCL,
+    NOT_METAL_OR_OPENCL,
+    ENV_VARIABLE_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = std::getenv("HL_TARGET");
+    if (hl_target == nullptr) {
+        printf("Warning: Environment variable HL_TARGET not specified. "
+               "Proceeding to the tests...\n");
+        return ENV_VARIABLE_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: clFinish timeout.
+        //
+        // Metal: copy_to_host() failed. Error
+        // Domain=MTLCommandBufferErrorDomain Code=2 "Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)"
+        // UserInfo={NSLocalizedDescription=Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)}
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
+               "generates copy_to_host() function calls that timeout. "
+               "Target = %s. Skipping...\n",
+               hl_target);
+
+        return USING_METAL_OR_OPENCL;
+    }
+
+    return NOT_METAL_OR_OPENCL;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc != 3) {
         printf("Usage: %s in out\n", argv[0]);
         return 1;
     }
 
+    if (ensure_cuda_device() == USING_METAL_OR_OPENCL) {
+        return 0;
+    }
+
     Halide::Runtime::Buffer<float, 3> input = load_and_convert_image(argv[1]);
     Halide::Runtime::Buffer<float, 3> output(input.width(), input.height(), input.channels());
 

diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
@@ -14,12 +14,29 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)
 
+set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _lens_blur_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+elseif(Halide_TARGET MATCHES "opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _lens_blur_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_lens_blur_autoscheduler_params})
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
@@ -32,26 +49,11 @@ target_link_libraries(lens_blur_filter
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "metal")
-        # Note(antonysigma): Buildbot error message:
-        #
-        # 2025-06-30 23:26:02.260 lens_blur_filter[32272:21031150] Metal API Validation
-        # Enabled -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
-        # failed assertion `(threadsPerThreadgroup.width(32) *
-        # threadsPerThreadgroup.height(32) * threadsPerThreadgroup.depth(1))(1024) must
-        # be <= 896. (kernel threadgroup size limit)'
-        #
-        # Possible root cause: Autoscheduler's GPUTilingDedup::max_n_threads is
-        # hardcoded to 1024 threads per block. The OSX Metal API caps the value at 836
-        # threads per block because of the register pressure in lens_blur's GPU kernel.
-        message ("Pipeline lens_blur_auto_schedule skipped for target host-metal")
-    else ()
-        configure_file(${IMAGE} rgb_small.png COPYONLY)
-        add_test(NAME lens_blur_filter
-                COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
-        set_tests_properties(lens_blur_filter PROPERTIES
-                            LABELS lens_blur
-                            PASS_REGULAR_EXPRESSION "Success!"
-                            SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb_small.png COPYONLY)
+    add_test(NAME lens_blur_filter
+            COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
+    set_tests_properties(lens_blur_filter PROPERTIES
+                        LABELS lens_blur
+                        PASS_REGULAR_EXPRESSION "Success!"
+                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
@@ -16,15 +16,24 @@ add_halide_generator(local_laplacian.generator
                      SOURCES local_laplacian_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|metal|opencl|vulkan")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _local_laplacian_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+endif()
+
 # Filters
 add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1
-                   )
+                   PARAMS ${_local_laplacian_autoscheduler_params})
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)
@@ -42,5 +51,9 @@ if (EXISTS ${IMAGE})
     set_tests_properties(local_laplacian_process PROPERTIES
                          LABELS local_laplacian
                          PASS_REGULAR_EXPRESSION "Success!"
-                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
+                         # Pass in the keyword "metal" etc to skip the test
+                         # explicitly. Buildbot can print a nice test report
+                         # for all skipped tests.
+                         ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
 endif ()
diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp
@@ -97,6 +97,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
         // Provide estimates on the pipeline output
         output.set_estimates({{0, 1536}, {0, 2560}, {0, 3}});
 
+        // Hardcode the input and output dimensions to suppress the OpenCL/Metal
+        // launch failure:
+        //
+        //  OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
+        //  failed
+        input.dim(0).set_bounds(0, 1536).set_stride(1);
+        input.dim(1).set_bounds(0, 2560).set_stride(1536);
+        input.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
+
+        output.dim(0).set_bounds(0, 1536).set_stride(1);
+        output.dim(1).set_bounds(0, 2560).set_stride(1536);
+        output.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
+
         /* THE SCHEDULE */
         if (using_autoscheduler()) {
             // Nothing.

diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
@@ -1,5 +1,7 @@
 #include <chrono>
 #include <cstdio>
+#include <cstdlib>
+#include <regex>
 
 #include "local_laplacian.h"
 #ifndef NO_AUTO_SCHEDULE
@@ -13,13 +15,59 @@
 using namespace Halide::Runtime;
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    IS_CUDA,
+    NOT_CUDA,
+    ENV_VARIABLE_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = std::getenv("HL_TARGET");
+    if (hl_target == nullptr) {
+        printf("Warning: Environment variable HL_TARGET not specified. "
+               "Proceeding to the tests...\n");
+        return ENV_VARIABLE_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"metal|vulkan|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
+        // failed
+        //
+        // 2025-07-17 17:24:32.170 local_laplacian_process[63513:6587844] Metal
+        // API Validation Enabled -[MTLDebugComputeCommandEncoder
+        // _validateThreadsPerThreadgroup:]:1266: failed assertion
+        // `(threadsPerThreadgroup.width(62) * threadsPerThreadgroup.height(32)
+        // * threadsPerThreadgroup.depth(1))(1984) must be <= 1024. (device
+        // threadgroup size limit)'
+        //
+        // Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
+               "generates the gpu_threads where thread count per block "
+               "is not an multiple of 32. Target = %s. Skipping...\n",
+               hl_target);
+
+        return NOT_CUDA;
+    }
+
+    return IS_CUDA;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc < 7) {
         printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n"
                "e.g.: ./process input.png 8 1 1 10 output.png\n");
         return 1;
     }
 
+    if (ensure_cuda_device() == NOT_CUDA) {
+        return 0;
+    }
+
     // Input may be a PNG8
     Buffer<uint16_t, 3> input = load_and_convert_image(argv[1]);