@@ -1368,7 +1368,7 @@ class GPUTilingDedup {
13681368 }
13691369
13701370 /** Generate Halide GPU schedules. */
1371- void apply (AutoSchedule &sched) {
1371+ void apply (AutoSchedule &sched, const Expr& parallelism ) {
13721372 if (!ordering.empty () && !is_initial_order) {
13731373 std::set<std::string> var_list;
13741374 for (const auto &v : ordering) {
@@ -1396,7 +1396,7 @@ class GPUTilingDedup {
13961396 }
13971397
13981398 GPUTileHelper helper{f, stage_num};
1399- Expr threads_budget = max_n_threads;
1399+ Expr threads_budget = min (parallelism, max_n_threads) ;
14001400
14011401 // Maximize GPU thread occupancy with the grid-stride loop.
14021402 //
@@ -1423,22 +1423,22 @@ class GPUTilingDedup {
14231423
14241424 const auto &[var, entry] = *iter;
14251425
1426- const bool should_unroll = can_prove (entry.factor <= 1 );
1427- if (should_unroll) {
1428- // Skip thread size of 1.
1429- continue ;
1430- }
1426+ // const bool should_unroll = can_prove(entry.factor <= 1);
1427+ // if (should_unroll) {
1428+ // // Skip thread size of 1.
1429+ // continue;
1430+ // }
14311431
14321432 split_info new_entry{entry};
1433- new_entry.factor = 1 ;
1433+ new_entry.factor = simplify ( min (threads_budget, entry. factor )) ;
14341434
14351435 const bool can_split = helper.try_split (new_entry);
14361436 if (!can_split) {
14371437 // If more than 3 gpu_blocks are defined, mark the current loop as the for-loop.
14381438 parallelize.erase (iter);
14391439 continue ;
14401440 }
1441- threads_budget = simplify (max (threads_budget / entry .factor , 1 ));
1441+ threads_budget = simplify (max (threads_budget / new_entry .factor , 1 ));
14421442 }
14431443
14441444 helper.commit (sched, is_compute_at);
@@ -2210,7 +2210,7 @@ Partitioner::find_best_tile_config(const Group &g) {
22102210 Group no_tile = g;
22112211 no_tile.tile_sizes = no_tile_config;
22122212
2213- bool show_analysis = false ;
2213+ constexpr bool show_analysis = false ;
22142214 GroupAnalysis no_tile_analysis = analyze_group (no_tile, show_analysis);
22152215
22162216 GroupAnalysis best_analysis = no_tile_analysis;
@@ -2233,7 +2233,7 @@ Partitioner::find_best_tile_config(const Group &g) {
22332233 Expr benefit = estimate_benefit (best_analysis, new_analysis,
22342234 no_redundant_work, true );
22352235
2236- if (show_analysis) {
2236+ if constexpr (show_analysis) {
22372237 debug (0 ) << " Benefit relative to not tiling:" << benefit << " \n " ;
22382238 debug (0 ) << " Best analysis:" << new_analysis;
22392239 debug (0 ) << " No tile analysis:" << no_tile_analysis;
@@ -3439,7 +3439,8 @@ void Partitioner::generate_group_cpu_schedule(
34393439 }
34403440 }
34413441 if (arch_params.is_gpu_schedule ) {
3442- auto parallelized_split = gpu_tiling.can_parallelize (v, iter->second );
3442+ const Expr gpu_threads = simplify (min (iter->second , arch_params.parallelism / def_par));
3443+ auto parallelized_split = gpu_tiling.can_parallelize (v, gpu_threads);
34433444 if (parallelized_split) {
34443445 auto split_vars = *parallelized_split;
34453446 inner_dims.emplace_back (split_vars.inner );
@@ -3463,7 +3464,7 @@ void Partitioner::generate_group_cpu_schedule(
34633464 }
34643465
34653466 if (arch_params.is_gpu_schedule ) {
3466- gpu_tiling.apply (sched);
3467+ gpu_tiling.apply (sched, arch_params. parallelism );
34673468 }
34683469
34693470 // Find the level at which group members will be computed.
@@ -3552,7 +3553,7 @@ void Partitioner::generate_group_cpu_schedule(
35523553 mem_rvars, mem_estimates, sched, gpu_tiling2);
35533554
35543555 if (arch_params.is_gpu_schedule ) {
3555- gpu_tiling2.apply (sched);
3556+ gpu_tiling2.apply (sched, arch_params. parallelism );
35563557 }
35573558 }
35583559}
0 commit comments