Skip to content

Commit 4ab8dab

Browse files
authored
[AMDGPU] Add s_cluster_barrier on gfx1250 (llvm#159175)
1 parent 59e43fe commit 4ab8dab

File tree

7 files changed

+191
-9
lines changed

7 files changed

+191
-9
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,7 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
696696
//===----------------------------------------------------------------------===//
697697
// GFX1250+ only builtins.
698698
//===----------------------------------------------------------------------===//
699+
TARGET_BUILTIN(__builtin_amdgcn_s_cluster_barrier, "v", "n", "gfx1250-insts")
699700

700701
TARGET_BUILTIN(__builtin_amdgcn_flat_prefetch, "vvC*0Ii", "nc", "vmem-pref-insts")
701702
TARGET_BUILTIN(__builtin_amdgcn_global_prefetch, "vvC*1Ii", "nc", "vmem-pref-insts")

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1478,6 +1478,16 @@ void test_prefetch(generic void *fptr, global void *gptr) {
14781478
__builtin_amdgcn_global_prefetch(gptr, 8);
14791479
}
14801480

1481+
// CHECK-LABEL: @test_s_cluster_barrier(
1482+
// CHECK-NEXT: entry:
1483+
// CHECK-NEXT: call void @llvm.amdgcn.s.cluster.barrier()
1484+
// CHECK-NEXT: ret void
1485+
//
1486+
void test_s_cluster_barrier()
1487+
{
1488+
__builtin_amdgcn_s_cluster_barrier();
1489+
}
1490+
14811491
// CHECK-LABEL: @test_global_add_f32(
14821492
// CHECK-NEXT: entry:
14831493
// CHECK-NEXT: [[RETVAL:%.*]] = alloca float, align 4, addrspace(5)

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3691,6 +3691,10 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
36913691
// gfx1250 intrinsics
36923692
// ===----------------------------------------------------------------------===//
36933693

3694+
// Plain cluster-wide barrier; lowered to split barrier signal/wait intrinsics.
3695+
def int_amdgcn_s_cluster_barrier : ClangBuiltin<"__builtin_amdgcn_s_cluster_barrier">,
3696+
Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
3697+
36943698
// Async waits decrement ASYNCcnt and tensor waits decrement TENSORcnt which is
36953699
// modeled as InaccessibleMem.
36963700
class AMDGPUWaitAsyncIntrinsic :

llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp

Lines changed: 60 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "llvm/IR/IntrinsicInst.h"
1919
#include "llvm/IR/IntrinsicsAMDGPU.h"
2020
#include "llvm/InitializePasses.h"
21+
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
2122

2223
#define DEBUG_TYPE "amdgpu-lower-intrinsics"
2324

@@ -49,7 +50,6 @@ class AMDGPULowerIntrinsicsLegacy : public ModulePass {
4950

5051
void getAnalysisUsage(AnalysisUsage &AU) const override {
5152
AU.addRequired<TargetPassConfig>();
52-
AU.setPreservesCFG();
5353
}
5454
};
5555

@@ -73,6 +73,7 @@ bool AMDGPULowerIntrinsicsImpl::run() {
7373
case Intrinsic::amdgcn_s_barrier_signal:
7474
case Intrinsic::amdgcn_s_barrier_signal_isfirst:
7575
case Intrinsic::amdgcn_s_barrier_wait:
76+
case Intrinsic::amdgcn_s_cluster_barrier:
7677
forEachCall(F, [&](IntrinsicInst *II) { Changed |= visitBarrier(*II); });
7778
break;
7879
}
@@ -81,13 +82,14 @@ bool AMDGPULowerIntrinsicsImpl::run() {
8182
return Changed;
8283
}
8384

84-
// Optimize barriers and lower s_barrier to a sequence of split barrier
85-
// intrinsics.
85+
// Optimize barriers and lower s_(cluster_)barrier to a sequence of split
86+
// barrier intrinsics.
8687
bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
8788
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
8889
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
8990
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst ||
90-
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait);
91+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
92+
I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier);
9193

9294
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*I.getFunction());
9395
bool IsSingleWaveWG = false;
@@ -99,7 +101,59 @@ bool AMDGPULowerIntrinsicsImpl::visitBarrier(IntrinsicInst &I) {
99101

100102
IRBuilder<> B(&I);
101103

102-
if (IsSingleWaveWG) {
104+
// Lower the s_cluster_barrier intrinsic first. There is no corresponding
105+
// hardware instruction in any subtarget.
106+
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_cluster_barrier) {
107+
// The default cluster barrier expects one signal per workgroup. So we need
108+
// a workgroup barrier first.
109+
if (IsSingleWaveWG) {
110+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_wave_barrier, {});
111+
} else {
112+
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::WORKGROUP);
113+
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::WORKGROUP);
114+
Value *IsFirst = B.CreateIntrinsic(
115+
B.getInt1Ty(), Intrinsic::amdgcn_s_barrier_signal_isfirst,
116+
{BarrierID_32});
117+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
118+
{BarrierID_16});
119+
120+
Instruction *ThenTerm =
121+
SplitBlockAndInsertIfThen(IsFirst, I.getIterator(), false);
122+
B.SetInsertPoint(ThenTerm);
123+
}
124+
125+
// Now we can signal the cluster barrier from a single wave and wait for the
126+
// barrier in all waves.
127+
Value *BarrierID_32 = B.getInt32(AMDGPU::Barrier::CLUSTER);
128+
Value *BarrierID_16 = B.getInt16(AMDGPU::Barrier::CLUSTER);
129+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_signal,
130+
{BarrierID_32});
131+
132+
B.SetInsertPoint(&I);
133+
B.CreateIntrinsic(B.getVoidTy(), Intrinsic::amdgcn_s_barrier_wait,
134+
{BarrierID_16});
135+
136+
I.eraseFromParent();
137+
return true;
138+
}
139+
140+
bool IsWorkgroupScope = false;
141+
142+
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait ||
143+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal ||
144+
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_signal_isfirst) {
145+
int BarrierID = cast<ConstantInt>(I.getArgOperand(0))->getSExtValue();
146+
if (BarrierID == AMDGPU::Barrier::TRAP ||
147+
BarrierID == AMDGPU::Barrier::WORKGROUP ||
148+
(BarrierID >= AMDGPU::Barrier::NAMED_BARRIER_FIRST &&
149+
BarrierID <= AMDGPU::Barrier::NAMED_BARRIER_LAST))
150+
IsWorkgroupScope = true;
151+
} else {
152+
assert(I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier);
153+
IsWorkgroupScope = true;
154+
}
155+
156+
if (IsWorkgroupScope && IsSingleWaveWG) {
103157
// Down-grade waits, remove split signals.
104158
if (I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier ||
105159
I.getIntrinsicID() == Intrinsic::amdgcn_s_barrier_wait) {
@@ -134,9 +188,7 @@ PreservedAnalyses AMDGPULowerIntrinsicsPass::run(Module &M,
134188
AMDGPULowerIntrinsicsImpl Impl(M, TM);
135189
if (!Impl.run())
136190
return PreservedAnalyses::all();
137-
PreservedAnalyses PA;
138-
PA.preserveSet<CFGAnalyses>();
139-
return PA;
191+
return PreservedAnalyses::none();
140192
}
141193

142194
bool AMDGPULowerIntrinsicsLegacy::runOnModule(Module &M) {

llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,7 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
360360
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
361361
switch (II->getIntrinsicID()) {
362362
case Intrinsic::amdgcn_s_barrier:
363+
case Intrinsic::amdgcn_s_cluster_barrier:
363364
case Intrinsic::amdgcn_s_barrier_signal:
364365
case Intrinsic::amdgcn_s_barrier_signal_var:
365366
case Intrinsic::amdgcn_s_barrier_signal_isfirst:

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1108,7 +1108,14 @@ enum Register_Flag : uint8_t {
11081108
namespace AMDGPU {
11091109
namespace Barrier {
11101110

1111-
enum Type { TRAP = -2, WORKGROUP = -1 };
1111+
enum Type { // Barrier IDs passed as the operand of the s_barrier_signal*/s_barrier_wait intrinsics.
1112+
CLUSTER_TRAP = -4, // NOTE(review): presumably the cluster-scope counterpart of TRAP; not used in this change - confirm.
1113+
CLUSTER = -3, // Cluster barrier; s_cluster_barrier is lowered to a signal/wait pair on this ID.
1114+
TRAP = -2, // Classified as workgroup scope by AMDGPULowerIntrinsics.
1115+
WORKGROUP = -1, // Workgroup barrier.
1116+
NAMED_BARRIER_FIRST = 1, // First ID of the named-barrier range (inclusive).
1117+
NAMED_BARRIER_LAST = 16, // Last ID of the named-barrier range (inclusive).
1118+
};
11121119

11131120
enum {
11141121
BARRIER_SCOPE_WORKGROUP = 0,
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=0 | FileCheck --check-prefixes=CHECK,NOOPT %s
3+
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize32 | FileCheck --check-prefixes=CHECK,OPT-WAVE32 %s
4+
; RUN: opt < %s -passes=amdgpu-lower-intrinsics -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -codegen-opt-level=1 -mattr=+wavefrontsize64 | FileCheck --check-prefixes=CHECK,OPT-WAVE64 %s
5+
6+
declare void @foo(i1)
7+
8+
; Verify that explicit split cluster barriers (ID -3) are kept as-is, even in a work-group that fits in a single wave.
9+
define amdgpu_kernel void @split_barriers() "amdgpu-flat-work-group-size"="32,32" {
10+
; CHECK-LABEL: define amdgpu_kernel void @split_barriers(
11+
; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
12+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
13+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
14+
; CHECK-NEXT: [[ISFIRST:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
15+
; CHECK-NEXT: call void @foo(i1 [[ISFIRST]])
16+
; CHECK-NEXT: ret void
17+
;
18+
call void @llvm.amdgcn.s.barrier.signal(i32 -3)
19+
call void @llvm.amdgcn.s.barrier.wait(i16 -3)
20+
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -3)
21+
call void @foo(i1 %isfirst)
22+
ret void
23+
}
24+
25+
; No work-group size attribute: every RUN configuration expects the full
; lowering. That is a workgroup barrier (signal.isfirst + wait on ID -1), a
; cluster signal (ID -3) guarded by the isfirst result, and a cluster wait
; (ID -3) executed on both paths.
define amdgpu_kernel void @s_cluster_barrier() {
26+
; CHECK-LABEL: define amdgpu_kernel void @s_cluster_barrier(
27+
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
28+
; CHECK-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
29+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
30+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
31+
; CHECK: [[BB2]]:
32+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
33+
; CHECK-NEXT: br label %[[BB3]]
34+
; CHECK: [[BB3]]:
35+
; CHECK-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
36+
; CHECK-NEXT: ret void
37+
;
38+
call void @llvm.amdgcn.s.cluster.barrier()
39+
ret void
40+
}
41+
42+
; Work-group size fixed at 32. With -codegen-opt-level=1 both the wave32 and
; the wave64 runs expect the workgroup barrier to be replaced by a
; wave.barrier followed by an unconditional cluster signal and wait. With
; -codegen-opt-level=0 the full branching lowering is expected instead.
define amdgpu_kernel void @s_cluster_barrier_wg32() "amdgpu-flat-work-group-size"="32,32" {
43+
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
44+
; NOOPT-SAME: ) #[[ATTR1]] {
45+
; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
46+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
47+
; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
48+
; NOOPT: [[BB2]]:
49+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
50+
; NOOPT-NEXT: br label %[[BB3]]
51+
; NOOPT: [[BB3]]:
52+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
53+
; NOOPT-NEXT: ret void
54+
;
55+
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
56+
; OPT-WAVE32-SAME: ) #[[ATTR1]] {
57+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.wave.barrier()
58+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
59+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
60+
; OPT-WAVE32-NEXT: ret void
61+
;
62+
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg32(
63+
; OPT-WAVE64-SAME: ) #[[ATTR1]] {
64+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
65+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
66+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
67+
; OPT-WAVE64-NEXT: ret void
68+
;
69+
call void @llvm.amdgcn.s.cluster.barrier()
70+
ret void
71+
}
72+
73+
; Work-group size fixed at 64. Only the wave64 run at -codegen-opt-level=1
; expects the wave.barrier form; the wave32 run at the same opt level and the
; -codegen-opt-level=0 run both expect the full branching lowering.
define amdgpu_kernel void @s_cluster_barrier_wg64() "amdgpu-flat-work-group-size"="64,64" {
74+
; NOOPT-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
75+
; NOOPT-SAME: ) #[[ATTR2:[0-9]+]] {
76+
; NOOPT-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
77+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
78+
; NOOPT-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
79+
; NOOPT: [[BB2]]:
80+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
81+
; NOOPT-NEXT: br label %[[BB3]]
82+
; NOOPT: [[BB3]]:
83+
; NOOPT-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
84+
; NOOPT-NEXT: ret void
85+
;
86+
; OPT-WAVE32-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
87+
; OPT-WAVE32-SAME: ) #[[ATTR2:[0-9]+]] {
88+
; OPT-WAVE32-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
89+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -1)
90+
; OPT-WAVE32-NEXT: br i1 [[TMP1]], label %[[BB2:.*]], label %[[BB3:.*]]
91+
; OPT-WAVE32: [[BB2]]:
92+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
93+
; OPT-WAVE32-NEXT: br label %[[BB3]]
94+
; OPT-WAVE32: [[BB3]]:
95+
; OPT-WAVE32-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
96+
; OPT-WAVE32-NEXT: ret void
97+
;
98+
; OPT-WAVE64-LABEL: define amdgpu_kernel void @s_cluster_barrier_wg64(
99+
; OPT-WAVE64-SAME: ) #[[ATTR2:[0-9]+]] {
100+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.wave.barrier()
101+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.signal(i32 -3)
102+
; OPT-WAVE64-NEXT: call void @llvm.amdgcn.s.barrier.wait(i16 -3)
103+
; OPT-WAVE64-NEXT: ret void
104+
;
105+
call void @llvm.amdgcn.s.cluster.barrier()
106+
ret void
107+
}

0 commit comments

Comments
 (0)