Skip to content

Commit 99c741e

Browse files
authored
[OpenMP][amdgpu] For generic-SPMD kernels, reduce blocksize but maintain occupancy. (llvm#4270)
2 parents c97fbb0 + 0cd6b0b commit 99c741e

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1286,6 +1286,29 @@ struct AMDGPUKernelTy : public GenericKernelTy {
12861286
PreferredNumBlocks = std::min(TripCountNumBlocks, AdjustedNumBlocks);
12871287
}
12881288

1289+
// For most generic-SPMD kernels, the tripcount of the outer distribute-loop
1290+
// determines the number of teams launched. The tripcounts of the inner
1291+
// parallel loops should determine the number of threads launched. However,
1292+
// the inner loop tripcounts are unknown, so the runtime just launches 256
1293+
// threads by default. But if the inner loop tripcount is lower than 256,
1294+
// many of the threads in every workgroup are idle and just waste resources.
1295+
// In order to reduce this wastage, we reduce the blocksize down to the
1296+
// wavefront size if the tripcount is large enough to proportionally
1297+
// increase the number of teams. The increase in the number of teams is
1298+
// required to preserve the occupancy in case the inner loop tripcounts are
1299+
// larger than the blocksize. This change is done only when the user has not
1300+
// specified the number of teams or threads.
1301+
if (isGenericSPMDMode() && !IsNumThreadsFromUser &&
1302+
NumTeamsClause[0] == 0 && NumTeamsEnvVar == 0 &&
1303+
GenericDevice.getOMPXGenericSpmdUseSmallBlockSize()) {
1304+
uint64_t TmpPreferredNumBlocks = PreferredNumBlocks << 1;
1305+
while (TmpPreferredNumBlocks <= LoopTripCount &&
1306+
NumThreads > GenericDevice.getWarpSize()) {
1307+
NumThreads >>= 1;
1308+
PreferredNumBlocks = TmpPreferredNumBlocks;
1309+
TmpPreferredNumBlocks <<= 1;
1310+
}
1311+
}
12891312
return std::min(PreferredNumBlocks,
12901313
(uint64_t)GenericDevice.getBlockLimit());
12911314
}
@@ -3135,6 +3158,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
31353158
0),
31363159
OMPX_AdjustNumTeamsForXteamRedSmallBlockSize(
31373160
"LIBOMPTARGET_AMDGPU_ADJUST_XTEAM_RED_TEAMS", 1),
3161+
OMPX_GenericSpmdUseSmallBlockSize(
3162+
"LIBOMPTARGET_AMDGPU_GENERIC_SPMD_USE_SMALL_BLOCKSIZE", 0),
31383163
OMPX_MaxAsyncCopyBytes("LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES",
31393164
64 * 1024),
31403165
OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
@@ -3267,6 +3292,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
32673292
getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const override {
32683293
return OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
32693294
}
3295+
virtual bool getOMPXGenericSpmdUseSmallBlockSize() const override {
3296+
return OMPX_GenericSpmdUseSmallBlockSize;
3297+
}
32703298

32713299
/// Initialize the device, its resources and get its properties.
32723300
Error initImpl(GenericPluginTy &Plugin) override {
@@ -4862,6 +4890,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
48624890
/// value should be used as the scaling factor for the number of teams.
48634891
UInt32Envar OMPX_AdjustNumTeamsForXteamRedSmallBlockSize;
48644892

4893+
/// Envar indicating whether, for generic-SPMD kernels, the blocksize should
4894+
/// be reduced and the corresponding number of teams adjusted.
4895+
BoolEnvar OMPX_GenericSpmdUseSmallBlockSize;
4896+
48654897
/// Envar specifying the maximum size in bytes where the memory copies are
48664898
/// asynchronous operations. Up to this transfer size, the memory copies are
48674899
/// asynchronous operations pushed to the corresponding stream. For larger

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
11641164
virtual uint32_t getOMPXAdjustNumTeamsForXteamRedSmallBlockSize() const {
11651165
llvm_unreachable("Unimplemented");
11661166
}
1167+
virtual bool getOMPXGenericSpmdUseSmallBlockSize() const {
1168+
llvm_unreachable("Unimplemented");
1169+
}
11671170

11681171
/// Get target compute unit kind (e.g., sm_80, or gfx908).
11691172
virtual std::string getComputeUnitKind() const { return "unknown"; }

0 commit comments

Comments
 (0)