Skip to content

Commit a77249b

Browse files
kuharrkayaith
authored andcommitted
Revert "AMDGPU/GFX12: Do not wait unnecessarily before barriers (llvm#154970)"
This reverts commit 4676242.
1 parent bc643b3 commit a77249b

File tree

5 files changed

+36
-30
lines changed

5 files changed

+36
-30
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2013,19 +2013,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
20132013
}
20142014
}
20152015

2016-
// Ensure safety against exceptions from outstanding memory operations while
2017-
// waiting for a barrier:
2018-
//
2019-
// * Some subtargets safely handle backing off the barrier in hardware
2020-
// when an exception occurs.
2021-
// * Some subtargets have an implicit S_WAITCNT 0 before barriers, so that
2022-
// there can be no outstanding memory operations during the wait.
2023-
// * Subtargets with split barriers don't need to back off the barrier; it
2024-
// is up to the trap handler to preserve the user barrier state correctly.
2025-
//
2026-
// In all other cases, ensure safety by ensuring that there are no outstanding
2027-
// memory operations.
2028-
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
2016+
// The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
2017+
// not, we need to ensure the subtarget is capable of backing off barrier
2018+
// instructions in case there are any outstanding memory operations that may
2019+
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
2020+
if (TII->isBarrierStart(MI.getOpcode()) &&
20292021
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
20302022
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
20312023
}

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -987,13 +987,19 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
987987
return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
988988
}
989989

990-
bool isBarrier(unsigned Opcode) const {
990+
// Check to see if opcode is for a barrier start. Pre gfx12 this is just the
991+
// S_BARRIER, but after support for S_BARRIER_SIGNAL* / S_BARRIER_WAIT we want
992+
// to check for the barrier start (S_BARRIER_SIGNAL*)
993+
bool isBarrierStart(unsigned Opcode) const {
991994
return Opcode == AMDGPU::S_BARRIER ||
992995
Opcode == AMDGPU::S_BARRIER_SIGNAL_M0 ||
993996
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0 ||
994997
Opcode == AMDGPU::S_BARRIER_SIGNAL_IMM ||
995-
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM ||
996-
Opcode == AMDGPU::S_BARRIER_WAIT ||
998+
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
999+
}
1000+
1001+
bool isBarrier(unsigned Opcode) const {
1002+
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
9971003
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
9981004
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
9991005
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
9898
; VARIANT4-NEXT: s_wait_kmcnt 0x0
9999
; VARIANT4-NEXT: v_xad_u32 v0, v2, -1, s2
100100
; VARIANT4-NEXT: global_store_b32 v3, v2, s[0:1]
101+
; VARIANT4-NEXT: s_wait_storecnt 0x0
101102
; VARIANT4-NEXT: s_barrier_signal -1
102103
; VARIANT4-NEXT: s_barrier_wait -1
103104
; VARIANT4-NEXT: v_ashrrev_i32_e32 v1, 31, v0
@@ -144,6 +145,7 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
144145
; VARIANT6-NEXT: v_sub_nc_u32_e32 v0, s2, v4
145146
; VARIANT6-NEXT: global_store_b32 v5, v4, s[0:1]
146147
; VARIANT6-NEXT: v_ashrrev_i32_e32 v1, 31, v0
148+
; VARIANT6-NEXT: s_wait_storecnt 0x0
147149
; VARIANT6-NEXT: s_barrier_signal -1
148150
; VARIANT6-NEXT: s_barrier_wait -1
149151
; VARIANT6-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.signal.isfirst.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ define i1 @func1() {
1111
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
1212
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
1313
; GFX12-SDAG-NEXT: s_cmp_eq_u32 0, 0
14+
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
1415
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
1516
; GFX12-SDAG-NEXT: s_cselect_b32 s0, -1, 0
1617
; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
@@ -26,6 +27,7 @@ define i1 @func1() {
2627
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
2728
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
2829
; GFX12-GISEL-NEXT: s_cmp_eq_u32 0, 0
30+
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
2931
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
3032
; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0
3133
; GFX12-GISEL-NEXT: s_wait_alu 0xfffe

llvm/test/CodeGen/AMDGPU/s-barrier.ll

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@ define void @func1() {
1414
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
1515
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
1616
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
17-
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
18-
; GFX12-SDAG-NEXT: s_barrier_join m0
1917
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70003
18+
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
2019
; GFX12-SDAG-NEXT: s_barrier_signal m0
20+
; GFX12-SDAG-NEXT: s_mov_b32 m0, 3
21+
; GFX12-SDAG-NEXT: s_barrier_join m0
2122
; GFX12-SDAG-NEXT: s_barrier_wait 1
2223
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
2324
;
@@ -29,12 +30,13 @@ define void @func1() {
2930
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
3031
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
3132
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70003
32-
; GFX12-GISEL-NEXT: s_barrier_join 3
33+
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
3334
; GFX12-GISEL-NEXT: s_barrier_signal m0
35+
; GFX12-GISEL-NEXT: s_barrier_join 3
3436
; GFX12-GISEL-NEXT: s_barrier_wait 1
3537
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
36-
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
3738
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar3, i32 7)
39+
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar3)
3840
call void @llvm.amdgcn.s.barrier.wait(i16 1)
3941
ret void
4042
}
@@ -47,10 +49,11 @@ define void @func2() {
4749
; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
4850
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
4951
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
50-
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
51-
; GFX12-SDAG-NEXT: s_barrier_join m0
5252
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70001
53+
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
5354
; GFX12-SDAG-NEXT: s_barrier_signal m0
55+
; GFX12-SDAG-NEXT: s_mov_b32 m0, 1
56+
; GFX12-SDAG-NEXT: s_barrier_join m0
5457
; GFX12-SDAG-NEXT: s_barrier_wait 1
5558
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
5659
;
@@ -62,12 +65,13 @@ define void @func2() {
6265
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
6366
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
6467
; GFX12-GISEL-NEXT: s_mov_b32 m0, 0x70001
65-
; GFX12-GISEL-NEXT: s_barrier_join 1
68+
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
6669
; GFX12-GISEL-NEXT: s_barrier_signal m0
70+
; GFX12-GISEL-NEXT: s_barrier_join 1
6771
; GFX12-GISEL-NEXT: s_barrier_wait 1
6872
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
69-
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
7073
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar2, i32 7)
74+
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) @bar2)
7175
call void @llvm.amdgcn.s.barrier.wait(i16 1)
7276
ret void
7377
}
@@ -98,9 +102,9 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
98102
; GFX12-SDAG-NEXT: s_barrier_signal m0
99103
; GFX12-SDAG-NEXT: s_mov_b32 m0, s2
100104
; GFX12-SDAG-NEXT: s_barrier_signal -1
105+
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
101106
; GFX12-SDAG-NEXT: s_barrier_join m0
102107
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
103-
; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1
104108
; GFX12-SDAG-NEXT: s_barrier_wait 1
105109
; GFX12-SDAG-NEXT: s_barrier_leave
106110
; GFX12-SDAG-NEXT: s_get_barrier_state s3, m0
@@ -151,11 +155,11 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
151155
; GFX12-GISEL-NEXT: s_barrier_signal m0
152156
; GFX12-GISEL-NEXT: s_mov_b32 m0, s1
153157
; GFX12-GISEL-NEXT: s_barrier_signal m0
154-
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
155158
; GFX12-GISEL-NEXT: s_barrier_signal -1
156-
; GFX12-GISEL-NEXT: s_barrier_join m0
157159
; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1
160+
; GFX12-GISEL-NEXT: s_mov_b32 m0, s0
158161
; GFX12-GISEL-NEXT: s_add_co_u32 s8, s12, 48
162+
; GFX12-GISEL-NEXT: s_barrier_join m0
159163
; GFX12-GISEL-NEXT: s_barrier_wait 1
160164
; GFX12-GISEL-NEXT: s_barrier_leave
161165
; GFX12-GISEL-NEXT: s_get_barrier_state s0, 2
@@ -190,8 +194,8 @@ define amdgpu_kernel void @kernel1(ptr addrspace(1) %out, ptr addrspace(3) %in)
190194
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 12)
191195
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) %in, i32 9)
192196
call void @llvm.amdgcn.s.barrier.signal(i32 -1)
193-
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
194197
%isfirst = call i1 @llvm.amdgcn.s.barrier.signal.isfirst(i32 -1)
198+
call void @llvm.amdgcn.s.barrier.join(ptr addrspace(3) %in)
195199
call void @llvm.amdgcn.s.barrier.wait(i16 1)
196200
call void @llvm.amdgcn.s.barrier.leave(i16 1)
197201
%state = call i32 @llvm.amdgcn.s.get.named.barrier.state(ptr addrspace(3) @bar)
@@ -215,14 +219,14 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
215219
; GFX12-SDAG-NEXT: s_load_b64 s[12:13], s[6:7], 0x0
216220
; GFX12-SDAG-NEXT: s_mov_b32 m0, 0x70002
217221
; GFX12-SDAG-NEXT: s_add_nc_u64 s[8:9], s[4:5], 48
222+
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
218223
; GFX12-SDAG-NEXT: s_barrier_signal m0
219224
; GFX12-SDAG-NEXT: s_mov_b32 m0, 2
220225
; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1]
221226
; GFX12-SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
222227
; GFX12-SDAG-NEXT: s_mov_b32 s32, 0
223228
; GFX12-SDAG-NEXT: s_barrier_join m0
224229
; GFX12-SDAG-NEXT: s_barrier_wait 1
225-
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
226230
; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[12:13]
227231
; GFX12-SDAG-NEXT: s_endpgm
228232
;
@@ -241,10 +245,10 @@ define amdgpu_kernel void @kernel2(ptr addrspace(1) %out, ptr addrspace(3) %in)
241245
; GFX12-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1]
242246
; GFX12-GISEL-NEXT: s_mov_b64 s[6:7], s[2:3]
243247
; GFX12-GISEL-NEXT: s_mov_b32 s32, 0
248+
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
244249
; GFX12-GISEL-NEXT: s_barrier_signal m0
245250
; GFX12-GISEL-NEXT: s_barrier_join 2
246251
; GFX12-GISEL-NEXT: s_barrier_wait 1
247-
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
248252
; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[12:13]
249253
; GFX12-GISEL-NEXT: s_endpgm
250254
call void @llvm.amdgcn.s.barrier.signal.var(ptr addrspace(3) @bar, i32 7)

0 commit comments

Comments
 (0)