AMDGPU: Figure out required AGPR count for inline asm #150910
Conversation
@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: For now just try to compute the minimum number of AGPRs required to allocate the asm. Leave the attributor changes to turn this into an integer value for later.

Full diff: https://github.com/llvm/llvm-project/pull/150910.diff

2 Files Affected:
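To illustrate the counting rule (an example constructed from the patch's logic and tests below, not part of the PR itself): the register count for a virtual "a" constraint is divideCeil(type size in bits, 32), so a 64-bit def occupies two AGPRs.

%def = call i64 asm sideeffect "; def $0", "=a"() ; divideCeil(64, 32) = 2 AGPRs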
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 49d8b4447adfd..f0dbd81c874fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1201,16 +1201,61 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
-static bool inlineAsmUsesAGPRs(const InlineAsm *IA) {
- for (const auto &CI : IA->ParseConstraints()) {
+/// Compute the minimum number of AGPRs required to allocate the inline asm.
+static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
+ const CallBase &Call) {
+ unsigned ArgNo = 0;
+ unsigned ResNo = 0;
+ unsigned AGPRDefCount = 0;
+ unsigned AGPRUseCount = 0;
+ unsigned MaxPhysReg = 0;
+ const DataLayout &DL = Call.getFunction()->getParent()->getDataLayout();
+
+ for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ Type *Ty = nullptr;
+ switch (CI.Type) {
+ case InlineAsm::isOutput: {
+ Ty = Call.getType();
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ Ty = STy->getElementType(ResNo);
+ ++ResNo;
+ break;
+ }
+ case InlineAsm::isInput: {
+ Ty = Call.getArgOperand(ArgNo++)->getType();
+ break;
+ }
+ case InlineAsm::isLabel:
+ continue;
+ case InlineAsm::isClobber:
+ // Parse the physical register reference.
+ break;
+ }
+
for (StringRef Code : CI.Codes) {
- Code.consume_front("{");
- if (Code.starts_with("a"))
- return true;
+ if (Code.starts_with("a")) {
+ // Virtual register, compute number of registers based on the type.
+ //
+ // We ought to be going through TargetLowering to get the number of
+ // registers, but we should avoid the dependence on CodeGen here.
+ unsigned RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
+ if (CI.Type == InlineAsm::isOutput) {
+ AGPRDefCount += RegCount;
+ if (CI.isEarlyClobber)
+ AGPRUseCount += RegCount;
+ } else
+ AGPRUseCount += RegCount;
+ } else {
+ // Physical register reference
+ auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
+ if (Kind == 'a')
+ MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
+ }
}
}
- return false;
+ unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
+ return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
// TODO: Migrate to range merge of amdgpu-agpr-alloc.
@@ -1252,7 +1297,7 @@ struct AAAMDGPUNoAGPR
const Function *Callee = dyn_cast<Function>(CalleeOp);
if (!Callee) {
if (const InlineAsm *IA = dyn_cast<InlineAsm>(CalleeOp))
- return !inlineAsmUsesAGPRs(IA);
+ return inlineAsmGetNumRequiredAGPRs(IA, CB) == 0;
return false;
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 181dab8d4ca79..e502995cdb8ea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -251,6 +251,205 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
ret void
}
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, i32} asm sideeffect "; def $0", "=a,=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_struct_2(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, <2 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, <2 x i32>} asm sideeffect "; def $0", "=a,=v"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(ptr poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call ptr asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call ptr asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_vector_ptr_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <2 x ptr> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <2 x ptr> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_struct_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { i32, i32 } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {i32, i32} asm sideeffect "; def $0", "={a0},={a[4:5]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a4}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a[10:13]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a256}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_clobber_max() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_clobber_max(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; clobber $0", "~{a255}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_oob() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_oob(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a256}"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <32 x i32> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<32 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_use_def_max_ty(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <32 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <32 x i32> asm sideeffect "; use $0", "=a,a"(<32 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @vreg_use_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_use_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(<257 x i32> poison)
+ ret void
+}
+
+define amdgpu_kernel void @vreg_def_exceeds_register_file() {
+; CHECK-LABEL: define amdgpu_kernel void @vreg_def_exceeds_register_file(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <257 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <257 x i32> asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @multiple() {
+; CHECK-LABEL: define amdgpu_kernel void @multiple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { <16 x i32>, <8 x i32>, <8 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call {<16 x i32>, <8 x i32>, <8 x i32>} asm sideeffect "; def $0", "=a,=a,=a,a,a,a"(<4 x i32> splat (i32 0), <8 x i32> splat (i32 1), i64 999)
+ ret void
+}
+
+define amdgpu_kernel void @earlyclobber_0() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_0(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call <8 x i32> asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call <8 x i32> asm sideeffect "; def $0", "=&a,a"(i32 0)
+ ret void
+}
+
+define amdgpu_kernel void @earlyclobber_1() {
+; CHECK-LABEL: define amdgpu_kernel void @earlyclobber_1(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call { <8 x i32>, <16 x i32> } asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call { <8 x i32>, <16 x i32> } asm sideeffect "; def $0, $1", "=&a,=&a,a,a"(i32 0, <16 x i32> splat (i32 1))
+ ret void
+}
attributes #0 = { "amdgpu-agpr-alloc"="0" }
;.
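As a quick cross-check of the counting rules on the @multiple test above (a hand computation, not tool output): the three outputs need 16 + 8 + 8 = 32 AGPRs, the three inputs need 4 + 8 + 2 = 14, so MaxVirtReg = max(32, 14) = 32, and with no physical register references the result is min(32 + 0, 256) = 32.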
unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
return std::min(std::max(MaxVirtReg, MaxPhysReg), 256u);
Wouldn't this be wrong for inline asm that, for example, has a virtual register constraint for 4 dwords and a physical register constraint for a[1:4]? This implementation would say that 5 AGPRs are required, but it needs 8 or 9 (depending on whether a0, which can't be used for either operand, is counted; I'm not quite clear on what the function should compute for this).
I think it's now at least a sound under-approximation of the minimum number of AGPRs required to allocate the inline asm; is that enough for this function?
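For concreteness, the case described here would look roughly like this (an illustrative asm call written for this discussion, not one of the PR's tests):

call void asm sideeffect "; use $0, $1", "a,{a[1:4]}"(<4 x i32> poison, <4 x i32> poison)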
This example reports 8; I'll add it as a new test.
I see, I missed that RegCount is also set in the physical case.
If the expected answer for this example is 8 (and not 9), even though this can only be allocated using a[1:4] and a[5:8] (or higher indices, because the virtual registers are also contiguous), why do you need to check MaxPhysReg? It seems odd to me that this would report 8, but inline asm that just has a physical register constraint for a[5:8] would count as 9.
The goal is not really to count the number of registers this could optimally use; it is to find the maximum AGPR index that we need to ensure is allocated. Any direct physical register reference sets the limit.
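For example, the clobber test above already behaves this way: ~{a[10:13]} parses to RegIdx 10 with 4 registers, so MaxPhysReg becomes min(10 + 4, 256) = 14, meaning everything up through a13 must be allocatable even with no virtual AGPR operands present.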
Shouldn't we then compute 9 for the initial example (a virtual register constraint for 4 dwords and a physical register constraint for a[1:4]), since we need to ensure that a8 is allocated so that the 4 dwords for the virtual register constraint fit after a[1:4]?
I haven't tried to account for alignment, but basically this function is never going to be accurate until it's easily testable. With #162300 you can start reading out the computed values easily.
I agree that this logic needs tests. I think this is not a problem for the == 0 check it's currently only used in, so I'm fine with moving this along for now so that #162300 can land and the implementation can be improved. Maybe add a comment/TODO that it's currently not accurate/should be improved?
inlineAsmGetNumRequiredAGPRs should be made more accurate once it becomes testable.

Made much more conservative, allocating all virtregs after physregs and applying tuple-width alignment unconditionally.
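A minimal sketch of that conservative scheme (an illustration of the placement idea described above, not the PR's actual code; conservativeAGPRCount and VirtTupleWidths are hypothetical names, and per-operand tuple widths are assumed to be computed as in the patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>

// Place all virtual-register tuples after the highest physically referenced
// AGPR, rounding each start index up to the tuple width (unconditional
// tuple-width alignment), and clamp to the 256 registers in the AGPR file.
static unsigned conservativeAGPRCount(unsigned MaxPhysReg,
                                      llvm::ArrayRef<unsigned> VirtTupleWidths) {
  unsigned Next = MaxPhysReg; // first index free for virtual tuples
  for (unsigned Width : VirtTupleWidths) {
    assert(Width != 0 && "tuple width must be nonzero");
    Next = llvm::alignTo(Next, Width); // align start to the tuple width
    Next += Width;                     // registers consumed by this tuple
  }
  return std::min(Next, 256u);
}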
For now just try to compute the minimum number of AGPRs required to allocate the asm. Leave the attributor changes to turn this into an integer value for later.