Skip to content

Commit 2bd7d29

Browse files
committed
[AArch64][SME] Support aarch64-split-sve-objects with VLAs/realignment
This was left out of the original patch (llvm#142392) to simplify the initial implementation. However, after refactoring the SVE prologue/epilogue code in llvm#162253, it's not much of an extension to support this case. The main change here is that when restoring the SP from the FP for the SVE restores, we may need an additional frame offset to move from the start of the ZPR callee-saves to the start of the PPR callee-saves. This patch also fixes a previously latent bug where we'd add the `RealignmentPadding` when allocating the PPR locals, then again for the ZPR locals. This was unnecessary as the stack only needs to be realigned after all SVE allocations.
1 parent 8b93f27 commit 2bd7d29

File tree

6 files changed

+1490
-802
lines changed

6 files changed

+1490
-802
lines changed

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2373,13 +2373,6 @@ void AArch64FrameLowering::determineStackHazardSlot(
23732373
return;
23742374
}
23752375

2376-
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2377-
if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) {
2378-
LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable "
2379-
"sized objects or realignment\n");
2380-
return;
2381-
}
2382-
23832376
// If another calling convention is explicitly set FPRs can't be promoted to
23842377
// ZPR callee-saves.
23852378
if (!is_contained({CallingConv::C, CallingConv::Fast,
@@ -2395,6 +2388,7 @@ void AArch64FrameLowering::determineStackHazardSlot(
23952388
assert(Subtarget.isSVEorStreamingSVEAvailable() &&
23962389
"Expected SVE to be available for PPRs");
23972390

2391+
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
23982392
// With SplitSVEObjects the CS hazard padding is placed between the
23992393
// PPRs and ZPRs. If there are any FPR CS there would be a hazard between
24002394
// them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs.

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -793,10 +793,9 @@ void AArch64PrologueEmitter::emitPrologue() {
793793
CFAOffset += AllocateBeforePPRs;
794794
assert(PPRRange.End == ZPRRange.Begin &&
795795
"Expected ZPR callee saves after PPR locals");
796-
allocateStackSpace(PPRRange.End, RealignmentPadding, AllocateAfterPPRs,
797-
EmitAsyncCFI && !HasFP, CFAOffset,
798-
MFI.hasVarSizedObjects() || ZPR.LocalsSize ||
799-
NonSVELocalsSize);
796+
allocateStackSpace(
797+
PPRRange.End, 0, AllocateAfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset,
798+
MFI.hasVarSizedObjects() || ZPR.LocalsSize || NonSVELocalsSize);
800799
CFAOffset += AllocateAfterPPRs;
801800
} else {
802801
assert(SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord);
@@ -1308,6 +1307,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF,
13081307
SEHEpilogueStartI = MBB.end();
13091308
}
13101309

1310+
void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI,
1311+
StackOffset Offset) {
1312+
// Other combinations could be supported, but are not currently needed.
1313+
assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 &&
1314+
"expected negative offset (with optional fixed portion)");
1315+
Register Base = AArch64::FP;
1316+
if (int64_t FixedOffset = Offset.getFixed()) {
1317+
// If we have a negative fixed offset, we need to first subtract it in a
1318+
// temporary register (to avoid briefly deallocating the scalable
1319+
// portion of the offset).
1320+
Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
1321+
emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP,
1322+
StackOffset::getFixed(FixedOffset), TII,
1323+
MachineInstr::FrameDestroy);
1324+
}
1325+
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base,
1326+
StackOffset::getScalable(Offset.getScalable()), TII,
1327+
MachineInstr::FrameDestroy);
1328+
}
1329+
13111330
void AArch64EpilogueEmitter::emitEpilogue() {
13121331
MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr();
13131332
if (MBB.end() != EpilogueEndI) {
@@ -1408,6 +1427,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14081427
AfterCSRPopSize += ProloguePopSize;
14091428
}
14101429
}
1430+
14111431
// Move past the restores of the callee-saved registers.
14121432
// If we plan on combining the sp bump of the local stack size and the callee
14131433
// save stack size, we might need to adjust the CSR save and restore offsets.
@@ -1474,8 +1494,6 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14741494
StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize;
14751495
StackOffset SVEStackSize =
14761496
SVECalleeSavesSize + PPR.LocalsSize + ZPR.LocalsSize;
1477-
MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin;
1478-
MachineBasicBlock::iterator RestoreEnd = PPRRange.End;
14791497

14801498
// Deallocate the SVE area.
14811499
if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) {
@@ -1490,7 +1508,7 @@ void AArch64EpilogueEmitter::emitEpilogue() {
14901508
}
14911509

14921510
// Deallocate callee-save SVE registers.
1493-
emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
1511+
emitFrameOffset(MBB, PPRRange.End, DL, AArch64::SP, AArch64::SP,
14941512
SVECalleeSavesSize, TII, MachineInstr::FrameDestroy, false,
14951513
NeedsWinCFI, &HasWinCFI);
14961514
} else if (AFI->hasSVEStackSize()) {
@@ -1501,28 +1519,26 @@ void AArch64EpilogueEmitter::emitEpilogue() {
15011519
(AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP
15021520
: AArch64::SP;
15031521
if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) {
1504-
// TODO: Support stack realigment and variable-sized objects.
1505-
assert(
1506-
SVELayout != SVEStackLayout::Split &&
1507-
"unexpected stack realignment or variable sized objects with split "
1508-
"SVE stack objects");
1509-
1510-
Register CalleeSaveBase = AArch64::FP;
1511-
if (int64_t CalleeSaveBaseOffset =
1512-
AFI->getCalleeSaveBaseToFrameRecordOffset()) {
1513-
// If we have have an non-zero offset to the non-SVE CS base we need to
1514-
// compute the base address by subtracting the offest in a temporary
1515-
// register first (to avoid briefly deallocating the SVE CS).
1516-
CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister(
1517-
&AArch64::GPR64RegClass);
1518-
emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP,
1519-
StackOffset::getFixed(-CalleeSaveBaseOffset), TII,
1522+
// The offset from the frame-pointer to the start of the ZPR/PPR CSRs.
1523+
StackOffset FPOffsetZPRCSRs =
1524+
-SVECalleeSavesSize -
1525+
StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset());
1526+
StackOffset FPOffsetPPRCSRs = FPOffsetZPRCSRs + ZPR.CalleeSavesSize;
1527+
1528+
// With split SVE, the PPR locals are above the ZPR callee-saves.
1529+
if (ZPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split)
1530+
FPOffsetZPRCSRs -= PPR.LocalsSize;
1531+
1532+
// The code below will deallocate the stack space by moving the SP
1533+
// to the start of the ZPR/PPR callee-save area.
1534+
moveSPBelowFP(ZPRRange.Begin, FPOffsetZPRCSRs);
1535+
1536+
if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) {
1537+
// Move to the start of the PPR area (this offset may be zero).
1538+
emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::SP,
1539+
FPOffsetPPRCSRs - FPOffsetZPRCSRs, TII,
15201540
MachineInstr::FrameDestroy);
15211541
}
1522-
// The code below will deallocate the stack space space by moving the SP
1523-
// to the start of the SVE callee-save area.
1524-
emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase,
1525-
-SVECalleeSavesSize, TII, MachineInstr::FrameDestroy);
15261542
} else if (BaseForSVEDealloc == AArch64::SP) {
15271543
auto CFAOffset =
15281544
SVEStackSize + StackOffset::getFixed(NumBytes + PrologueSaveSize);

llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon {
174174
private:
175175
bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const;
176176

177+
/// A helper for moving the SP to a negative offset from the FP, without
178+
/// deallocating any stack in the range FP to FP + Offset.
179+
void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset);
180+
177181
void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI,
178182
const DebugLoc &DL) const;
179183

llvm/test/CodeGen/AArch64/framelayout-split-sve.mir

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -182,63 +182,56 @@ body: |
182182
RET_ReallyLR
183183
184184
# CHECK-LABEL: name: test_allocate_split_sve_realigned
185-
# CHECK: stackSize: 2080
185+
# CHECK: stackSize: 1056
186186

187187
# CHECK: bb.0.entry:
188188
# CHECK: liveins: $z0, $p0, $lr
189-
# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
190-
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
191-
# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
192-
# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
193-
# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
189+
# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4)
190+
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
191+
# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
194192
# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
195193
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
196194
# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
197-
# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
195+
# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
196+
# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
197+
# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
198198
# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
199-
# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
199+
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930
200200
#
201201
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
202202
# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
203-
# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
204-
# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
205-
# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
203+
# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0)
204+
# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1)
206205
#
207-
# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
208-
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
209-
# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
210-
# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
211-
# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
206+
# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0
207+
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
208+
# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.5), (load (s64) from %stack.4)
212209
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
213210
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
214211
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
215212
# CHECK-NEXT: RET_ReallyLR
216213

217214
# ASM-LABEL: test_allocate_split_sve_realigned
218-
# ASM: sub sp, sp, #1040
219-
# ASM-NEXT: .cfi_def_cfa_offset 1040
220-
# ASM-NEXT: str x29, [sp, #1024]
221-
# ASM-NEXT: str x30, [sp, #1032]
222-
# ASM-NEXT: add x29, sp, #1024
215+
# ASM: stp x29, x30, [sp, #-16]!
216+
# ASM-NEXT: .cfi_def_cfa_offset 16
217+
# ASM-NEXT: mov x29, sp
223218
# ASM-NEXT: .cfi_def_cfa w29, 16
224219
# ASM-NEXT: .cfi_offset w30, -8
225220
# ASM-NEXT: .cfi_offset w29, -16
226221
#
227-
# ASM: sub sp, x29, #1024
228-
# ASM-NEXT: .cfi_def_cfa wsp, 1040
229-
# ASM-NEXT: ldr x30, [sp, #1032]
230-
# ASM-NEXT: ldr x29, [sp, #1024]
231-
# ASM-NEXT: add sp, sp, #1040
222+
# ASM: mov sp, x29
223+
# ASM-NEXT: .cfi_def_cfa wsp, 16
224+
# ASM-NEXT: ldp x29, x30, [sp], #16
232225
# ASM-NEXT: .cfi_def_cfa_offset 0
233226
# ASM-NEXT: .cfi_restore w30
234227
# ASM-NEXT: .cfi_restore w29
235228

236-
# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
229+
# UNWINDINFO: DW_CFA_def_cfa_offset: +16
237230
# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
238231
# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
239232
# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
240233
#
241-
# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
234+
# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
242235
# UNWINDINFO: DW_CFA_def_cfa_offset: +0
243236
# UNWINDINFO-NEXT: DW_CFA_restore: reg30
244237
# UNWINDINFO-NEXT: DW_CFA_restore: reg29

0 commit comments

Comments
 (0)