@@ -1286,6 +1286,29 @@ struct AMDGPUKernelTy : public GenericKernelTy {
1286
1286
PreferredNumBlocks = std::min (TripCountNumBlocks, AdjustedNumBlocks);
1287
1287
}
1288
1288
1289
+ // For most generic-SPMD kernels, the tripcount of the outer distribute-loop
1290
+ // determines the number of teams launched. The tripcounts of the inner
1291
+ // parallel loops should determine the number of threads launched. However,
1292
+ // the inner loop tripcounts are unknown, so the runtime just launches 256
1293
+ // threads by default. But if the inner loop tripcount is lower than 256,
1294
+ // many of the threads in every workgroup are idle and just waste resources.
1295
+ // In order to reduce this wastage, we reduce the blocksize down to the
1296
+ // wavefront size if the tripcount is large enough to proportionally
1297
+ // increase the number of teams. The increase in the number of teams is
1298
+ // required to preserve the occupancy in case the inner loop tripcounts are
1299
+ // larger than the blocksize. This change is done only when the user has not
1300
+ // specified the number of teams or threads.
1301
+ if (isGenericSPMDMode () && !IsNumThreadsFromUser &&
1302
+ NumTeamsClause[0 ] == 0 && NumTeamsEnvVar == 0 &&
1303
+ GenericDevice.getOMPXGenericSpmdUseSmallBlockSize ()) {
1304
+ uint64_t TmpPreferredNumBlocks = PreferredNumBlocks << 1 ;
1305
+ while (TmpPreferredNumBlocks <= LoopTripCount &&
1306
+ NumThreads > GenericDevice.getWarpSize ()) {
1307
+ NumThreads >>= 1 ;
1308
+ PreferredNumBlocks = TmpPreferredNumBlocks;
1309
+ TmpPreferredNumBlocks <<= 1 ;
1310
+ }
1311
+ }
1289
1312
return std::min (PreferredNumBlocks,
1290
1313
(uint64_t )GenericDevice.getBlockLimit ());
1291
1314
}
@@ -3135,6 +3158,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3135
3158
0 ),
3136
3159
OMPX_AdjustNumTeamsForXteamRedSmallBlockSize (
3137
3160
" LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS" , 1 ),
3161
+ OMPX_GenericSpmdUseSmallBlockSize (
3162
+ " LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE" , 0 ),
3138
3163
OMPX_MaxAsyncCopyBytes (" LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES" ,
3139
3164
64 * 1024 ),
3140
3165
OMPX_InitialNumSignals (" LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS" ,
@@ -3267,6 +3292,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
3267
3292
getOMPXAdjustNumTeamsForXteamRedSmallBlockSize () const override {
3268
3293
return OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
3269
3294
}
3295
/// Accessor for the OMPX_GenericSpmdUseSmallBlockSize envar: returns its
/// value as a bool. AMDGPUKernelTy checks this flag (in addition to the user
/// not having specified teams or threads) before halving the thread count of
/// generic-SPMD kernels and doubling the preferred number of blocks.
+ virtual bool getOMPXGenericSpmdUseSmallBlockSize () const override {
3296
+ return OMPX_GenericSpmdUseSmallBlockSize;
3297
+ }
3270
3298
3271
3299
// / Initialize the device, its resources and get its properties.
3272
3300
Error initImpl (GenericPluginTy &Plugin) override {
@@ -4862,6 +4890,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
4862
4890
// / value should be used as the scaling factor for the number of teams.
4863
4891
UInt32Envar OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
4864
4892
4893
+ // / Envar indicating whether, for generic-SPMD kernels, the blocksize should
4894
+ // / be reduced and the corresponding number of teams adjusted.
4895
+ BoolEnvar OMPX_GenericSpmdUseSmallBlockSize;
4896
+
4865
4897
// / Envar specifying the maximum size in bytes where the memory copies are
4866
4898
// / asynchronous operations. Up to this transfer size, the memory copies are
4867
4899
// / asynchronous operations pushed to the corresponding stream. For larger
0 commit comments