@@ -1634,6 +1634,9 @@ static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
16341634  case  AArch64::STR_PXI:
16351635  case  AArch64::LDR_ZXI:
16361636  case  AArch64::LDR_PXI:
1637+   case  AArch64::PTRUE_B:
1638+   case  AArch64::CPY_ZPzI_B:
1639+   case  AArch64::CMPNE_PPzZI_B:
16371640    return  I->getFlag (MachineInstr::FrameSetup) ||
16381641           I->getFlag (MachineInstr::FrameDestroy);
16391642  }
@@ -3265,7 +3268,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
32653268      StrOpc = RPI.isPaired () ? AArch64::ST1B_2Z_IMM : AArch64::STR_ZXI;
32663269      break ;
32673270    case  RegPairInfo::PPR:
3268-       StrOpc = AArch64::STR_PXI;
3271+       StrOpc =
3272+           Size == 16  ? AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO : AArch64::STR_PXI;
32693273      break ;
32703274    case  RegPairInfo::VG:
32713275      StrOpc = AArch64::STRXui;
@@ -3494,7 +3498,8 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
34943498      LdrOpc = RPI.isPaired () ? AArch64::LD1B_2Z_IMM : AArch64::LDR_ZXI;
34953499      break ;
34963500    case  RegPairInfo::PPR:
3497-       LdrOpc = AArch64::LDR_PXI;
3501+       LdrOpc = Size == 16  ? AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO
3502+                           : AArch64::LDR_PXI;
34983503      break ;
34993504    case  RegPairInfo::VG:
35003505      continue ;
@@ -3720,6 +3725,14 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
37203725      continue ;
37213726    }
37223727
3728+     //  Always save P4 when PPR spills are ZPR-sized and a predicate above p8 is
3729+     //  spilled. If all of p0-p3 are used as return values p4 is must be free
3730+     //  to reload p8-p15.
3731+     if  (RegInfo->getSpillSize (AArch64::PPRRegClass) == 16  &&
3732+         AArch64::PPR_p8to15RegClass.contains (Reg)) {
3733+       SavedRegs.set (AArch64::P4);
3734+     }
3735+ 
37233736    //  MachO's compact unwind format relies on all registers being stored in
37243737    //  pairs.
37253738    //  FIXME: the usual format is actually better if unwinding isn't needed.
@@ -4159,8 +4172,295 @@ int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
41594172                                        true );
41604173}
41614174
4175+ // / Attempts to scavenge a register from \p ScavengeableRegs given the used
4176+ // / registers in \p UsedRegs.
4177+ static  Register tryScavengeRegister (LiveRegUnits const  &UsedRegs,
4178+                                     BitVector const  &ScavengeableRegs) {
4179+   for  (auto  Reg : ScavengeableRegs.set_bits ()) {
4180+     if  (UsedRegs.available (Reg))
4181+       return  Reg;
4182+   }
4183+   return  AArch64::NoRegister;
4184+ }
4185+ 
4186+ // / Propagates frame-setup/destroy flags from \p SourceMI to all instructions in
4187+ // / \p MachineInstrs.
4188+ static  void  propagateFrameFlags (MachineInstr &SourceMI,
4189+                                 ArrayRef<MachineInstr *> MachineInstrs) {
4190+   for  (MachineInstr *MI : MachineInstrs) {
4191+     if  (SourceMI.getFlag (MachineInstr::FrameSetup))
4192+       MI->setFlag (MachineInstr::FrameSetup);
4193+     if  (SourceMI.getFlag (MachineInstr::FrameDestroy))
4194+       MI->setFlag (MachineInstr::FrameDestroy);
4195+   }
4196+ }
4197+ 
4198+ // / RAII helper class for scavenging or spilling a register. On construction
4199+ // / attempts to find a free register of class \p RC (given \p UsedRegs and \p
4200+ // / AllocatableRegs), if no register can be found spills \p SpillCandidate to \p
4201+ // / MaybeSpillFI to free a register. The free'd register is returned via the \p
4202+ // / FreeReg output parameter. On destruction, if there is a spill, its previous
4203+ // / value is reloaded. The spilling and scavenging is only valid at the
4204+ // / insertion point \p MBBI, this class should _not_ be used in places that
4205+ // / create or manipulate basic blocks, moving the expected insertion point.
4206+ struct  ScopedScavengeOrSpill  {
4207+   ScopedScavengeOrSpill (const  ScopedScavengeOrSpill &) = delete ;
4208+   ScopedScavengeOrSpill (ScopedScavengeOrSpill &&) = delete ;
4209+ 
4210+   ScopedScavengeOrSpill (MachineFunction &MF, MachineBasicBlock &MBB,
4211+                         MachineBasicBlock::iterator MBBI,
4212+                         Register SpillCandidate, const  TargetRegisterClass &RC,
4213+                         LiveRegUnits const  &UsedRegs,
4214+                         BitVector const  &AllocatableRegs,
4215+                         std::optional<int > *MaybeSpillFI)
4216+       : MBB(MBB), MBBI(MBBI), RC(RC), TII(static_cast <const  AArch64InstrInfo &>(
4217+                                           *MF.getSubtarget().getInstrInfo())),
4218+         TRI (*MF.getSubtarget().getRegisterInfo()) {
4219+     FreeReg = tryScavengeRegister (UsedRegs, AllocatableRegs);
4220+     if  (FreeReg != AArch64::NoRegister)
4221+       return ;
4222+     assert (MaybeSpillFI && " Expected emergency spill slot FI information " 
4223+                            " (attempted to spill in prologue/epilogue?)" 
4224+     if  (!MaybeSpillFI->has_value ()) {
4225+       MachineFrameInfo &MFI = MF.getFrameInfo ();
4226+       *MaybeSpillFI = MFI.CreateSpillStackObject (TRI.getSpillSize (RC),
4227+                                                  TRI.getSpillAlign (RC));
4228+     }
4229+     FreeReg = SpillCandidate;
4230+     SpillFI = MaybeSpillFI->value ();
4231+     TII.storeRegToStackSlot (MBB, MBBI, FreeReg, false , *SpillFI, &RC, &TRI,
4232+                             Register ());
4233+   }
4234+ 
4235+   bool  hasSpilled () const  { return  SpillFI.has_value (); }
4236+ 
4237+   // / Returns the free register (found from scavenging or spilling a register).
4238+   Register freeRegister () const  { return  FreeReg; }
4239+ 
4240+   Register operator *() const  { return  freeRegister (); }
4241+ 
4242+   ~ScopedScavengeOrSpill () {
4243+     if  (hasSpilled ())
4244+       TII.loadRegFromStackSlot (MBB, MBBI, FreeReg, *SpillFI, &RC, &TRI,
4245+                                Register ());
4246+   }
4247+ 
4248+ private: 
4249+   MachineBasicBlock &MBB;
4250+   MachineBasicBlock::iterator MBBI;
4251+   const  TargetRegisterClass &RC;
4252+   const  AArch64InstrInfo &TII;
4253+   const  TargetRegisterInfo &TRI;
4254+   Register FreeReg = AArch64::NoRegister;
4255+   std::optional<int > SpillFI;
4256+ };
4257+ 
4258+ // / Emergency stack slots for expanding SPILL_PPR_TO_ZPR_SLOT_PSEUDO and
4259+ // / FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4260+ struct  EmergencyStackSlots  {
4261+   std::optional<int > ZPRSpillFI;
4262+   std::optional<int > PPRSpillFI;
4263+   std::optional<int > GPRSpillFI;
4264+ };
4265+ 
4266+ // / Registers available for scavenging (ZPR, PPR3b, GPR).
4267+ struct  ScavengeableRegs  {
4268+   BitVector ZPRRegs;
4269+   BitVector PPR3bRegs;
4270+   BitVector GPRRegs;
4271+ };
4272+ 
4273+ static  bool  isInPrologueOrEpilogue (const  MachineInstr &MI) {
4274+   return  MI.getFlag (MachineInstr::FrameSetup) ||
4275+          MI.getFlag (MachineInstr::FrameDestroy);
4276+ }
4277+ 
4278+ // / Expands:
4279+ // / ```
4280+ // / SPILL_PPR_TO_ZPR_SLOT_PSEUDO $p0, %stack.0, 0
4281+ // / ```
4282+ // / To:
4283+ // / ```
4284+ // / $z0 = CPY_ZPzI_B $p0, 1, 0
4285+ // / STR_ZXI $z0, $stack.0, 0
4286+ // / ```
4287+ // / While ensuring a ZPR ($z0 in this example) is free for the predicate (
4288+ // / spilling if necessary).
4289+ static  void  expandSpillPPRToZPRSlotPseudo (MachineBasicBlock &MBB,
4290+                                           MachineInstr &MI,
4291+                                           const  TargetRegisterInfo &TRI,
4292+                                           LiveRegUnits const  &UsedRegs,
4293+                                           ScavengeableRegs const  &SR,
4294+                                           EmergencyStackSlots &SpillSlots) {
4295+   MachineFunction &MF = *MBB.getParent ();
4296+   auto  *TII =
4297+       static_cast <const  AArch64InstrInfo *>(MF.getSubtarget ().getInstrInfo ());
4298+ 
4299+   ScopedScavengeOrSpill ZPredReg (
4300+       MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs ,
4301+       isInPrologueOrEpilogue (MI) ? nullptr  : &SpillSlots.ZPRSpillFI );
4302+ 
4303+   SmallVector<MachineInstr *, 2 > MachineInstrs;
4304+   const  DebugLoc &DL = MI.getDebugLoc ();
4305+   MachineInstrs.push_back (BuildMI (MBB, MI, DL, TII->get (AArch64::CPY_ZPzI_B))
4306+                               .addReg (*ZPredReg, RegState::Define)
4307+                               .add (MI.getOperand (0 ))
4308+                               .addImm (1 )
4309+                               .addImm (0 )
4310+                               .getInstr ());
4311+   MachineInstrs.push_back (BuildMI (MBB, MI, DL, TII->get (AArch64::STR_ZXI))
4312+                               .addReg (*ZPredReg)
4313+                               .add (MI.getOperand (1 ))
4314+                               .addImm (MI.getOperand (2 ).getImm ())
4315+                               .setMemRefs (MI.memoperands ())
4316+                               .getInstr ());
4317+   propagateFrameFlags (MI, MachineInstrs);
4318+ }
4319+ 
4320+ // / Expands:
4321+ // / ```
4322+ // / $p0 = FILL_PPR_FROM_ZPR_SLOT_PSEUDO %stack.0, 0
4323+ // / ```
4324+ // / To:
4325+ // / ```
4326+ // / $z0 = LDR_ZXI %stack.0, 0
4327+ // / $p0 = PTRUE_B 31, implicit $vg
4328+ // / $p0 = CMPNE_PPzZI_B $p0, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
4329+ // / ```
4330+ // / While ensuring a ZPR ($z0 in this example) is free for the predicate (
4331+ // / spilling if necessary). If the status flags are in use at the point of
4332+ // / expansion they are preserved (by moving them to/from a GPR). This may cause
4333+ // / an additional spill if no GPR is free at the expansion point.
4334+ static  bool  expandFillPPRFromZPRSlotPseudo (MachineBasicBlock &MBB,
4335+                                            MachineInstr &MI,
4336+                                            const  TargetRegisterInfo &TRI,
4337+                                            LiveRegUnits const  &UsedRegs,
4338+                                            ScavengeableRegs const  &SR,
4339+                                            EmergencyStackSlots &SpillSlots) {
4340+   MachineFunction &MF = *MBB.getParent ();
4341+   auto  *TII =
4342+       static_cast <const  AArch64InstrInfo *>(MF.getSubtarget ().getInstrInfo ());
4343+ 
4344+   ScopedScavengeOrSpill ZPredReg (
4345+       MF, MBB, MI, AArch64::Z0, AArch64::ZPRRegClass, UsedRegs, SR.ZPRRegs ,
4346+       isInPrologueOrEpilogue (MI) ? nullptr  : &SpillSlots.ZPRSpillFI );
4347+ 
4348+   ScopedScavengeOrSpill PredReg (
4349+       MF, MBB, MI, AArch64::P0, AArch64::PPR_3bRegClass, UsedRegs, SR.PPR3bRegs ,
4350+       isInPrologueOrEpilogue (MI) ? nullptr  : &SpillSlots.PPRSpillFI );
4351+ 
4352+   //  Elide NZCV spills if we know it is not used.
4353+   bool  IsNZCVUsed = !UsedRegs.available (AArch64::NZCV);
4354+   std::optional<ScopedScavengeOrSpill> NZCVSaveReg;
4355+   if  (IsNZCVUsed)
4356+     NZCVSaveReg.emplace (
4357+         MF, MBB, MI, AArch64::X0, AArch64::GPR64RegClass, UsedRegs, SR.GPRRegs ,
4358+         isInPrologueOrEpilogue (MI) ? nullptr  : &SpillSlots.GPRSpillFI );
4359+   SmallVector<MachineInstr *, 4 > MachineInstrs;
4360+   const  DebugLoc &DL = MI.getDebugLoc ();
4361+   MachineInstrs.push_back (BuildMI (MBB, MI, DL, TII->get (AArch64::LDR_ZXI))
4362+                               .addReg (*ZPredReg, RegState::Define)
4363+                               .add (MI.getOperand (1 ))
4364+                               .addImm (MI.getOperand (2 ).getImm ())
4365+                               .setMemRefs (MI.memoperands ())
4366+                               .getInstr ());
4367+   if  (IsNZCVUsed)
4368+     MachineInstrs.push_back (
4369+         BuildMI (MBB, MI, DL, TII->get (AArch64::MRS))
4370+             .addReg (NZCVSaveReg->freeRegister (), RegState::Define)
4371+             .addImm (AArch64SysReg::NZCV)
4372+             .addReg (AArch64::NZCV, RegState::Implicit)
4373+             .getInstr ());
4374+   MachineInstrs.push_back (BuildMI (MBB, MI, DL, TII->get (AArch64::PTRUE_B))
4375+                               .addReg (*PredReg, RegState::Define)
4376+                               .addImm (31 ));
4377+   MachineInstrs.push_back (
4378+       BuildMI (MBB, MI, DL, TII->get (AArch64::CMPNE_PPzZI_B))
4379+           .addReg (MI.getOperand (0 ).getReg (), RegState::Define)
4380+           .addReg (*PredReg)
4381+           .addReg (*ZPredReg)
4382+           .addImm (0 )
4383+           .addReg (AArch64::NZCV, RegState::ImplicitDefine)
4384+           .getInstr ());
4385+   if  (IsNZCVUsed)
4386+     MachineInstrs.push_back (BuildMI (MBB, MI, DL, TII->get (AArch64::MSR))
4387+                                 .addImm (AArch64SysReg::NZCV)
4388+                                 .addReg (NZCVSaveReg->freeRegister ())
4389+                                 .addReg (AArch64::NZCV, RegState::ImplicitDefine)
4390+                                 .getInstr ());
4391+ 
4392+   propagateFrameFlags (MI, MachineInstrs);
4393+   return  PredReg.hasSpilled ();
4394+ }
4395+ 
4396+ // / Expands all FILL_PPR_FROM_ZPR_SLOT_PSEUDO and SPILL_PPR_TO_ZPR_SLOT_PSEUDO
4397+ // / operations within the MachineBasicBlock \p MBB.
4398+ static  bool  expandSMEPPRToZPRSpillPseudos (MachineBasicBlock &MBB,
4399+                                           const  TargetRegisterInfo &TRI,
4400+                                           ScavengeableRegs const  &SR,
4401+                                           EmergencyStackSlots &SpillSlots) {
4402+   LiveRegUnits UsedRegs (TRI);
4403+   UsedRegs.addLiveOuts (MBB);
4404+   bool  HasPPRSpills = false ;
4405+   for  (MachineInstr &MI : make_early_inc_range (reverse (MBB))) {
4406+     UsedRegs.stepBackward (MI);
4407+     switch  (MI.getOpcode ()) {
4408+     case  AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO:
4409+       HasPPRSpills |= expandFillPPRFromZPRSlotPseudo (MBB, MI, TRI, UsedRegs, SR,
4410+                                                      SpillSlots);
4411+       MI.eraseFromParent ();
4412+       break ;
4413+     case  AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO:
4414+       expandSpillPPRToZPRSlotPseudo (MBB, MI, TRI, UsedRegs, SR, SpillSlots);
4415+       MI.eraseFromParent ();
4416+       break ;
4417+     default :
4418+       break ;
4419+     }
4420+   }
4421+ 
4422+   return  HasPPRSpills;
4423+ }
4424+ 
41624425void  AArch64FrameLowering::processFunctionBeforeFrameFinalized (
41634426    MachineFunction &MF, RegScavenger *RS) const  {
4427+ 
4428+   AArch64FunctionInfo *AFI = MF.getInfo <AArch64FunctionInfo>();
4429+   const  TargetSubtargetInfo &TSI = MF.getSubtarget ();
4430+   const  TargetRegisterInfo &TRI = *TSI.getRegisterInfo ();
4431+ 
4432+   //  If predicates spills are 16-bytes we may need to expand
4433+   //  SPILL_PPR_TO_ZPR_SLOT_PSEUDO/FILL_PPR_FROM_ZPR_SLOT_PSEUDO.
4434+   if  (AFI->hasStackFrame () && TRI.getSpillSize (AArch64::PPRRegClass) == 16 ) {
4435+     auto  ComputeScavengeableRegisters = [&](unsigned  RegClassID) {
4436+       BitVector Regs = TRI.getAllocatableSet (MF, TRI.getRegClass (RegClassID));
4437+       assert (Regs.count () > 0  && " Expected scavengeable registers" 
4438+       return  Regs;
4439+     };
4440+ 
4441+     ScavengeableRegs SR{};
4442+     SR.ZPRRegs  = ComputeScavengeableRegisters (AArch64::ZPRRegClassID);
4443+     //  Only p0-7 are possible as the second operand of cmpne (needed for fills).
4444+     SR.PPR3bRegs  = ComputeScavengeableRegisters (AArch64::PPR_3bRegClassID);
4445+     SR.GPRRegs  = ComputeScavengeableRegisters (AArch64::GPR64RegClassID);
4446+ 
4447+     EmergencyStackSlots SpillSlots;
4448+     for  (MachineBasicBlock &MBB : MF) {
4449+       //  In the case we had to spill a predicate (in the range p0-p7) to reload
4450+       //  a predicate (>= p8), additional spill/fill pseudos will be created.
4451+       //  These need an additional expansion pass. Note: There will only be at
4452+       //  most two expansion passes, as spilling/filling a predicate in the range
4453+       //  p0-p7 never requires spilling another predicate.
4454+       for  (int  Pass = 0 ; Pass < 2 ; Pass++) {
4455+         bool  HasPPRSpills =
4456+             expandSMEPPRToZPRSpillPseudos (MBB, TRI, SR, SpillSlots);
4457+         assert ((Pass == 0  || !HasPPRSpills) && " Did not expect PPR spills" 
4458+         if  (!HasPPRSpills)
4459+           break ;
4460+       }
4461+     }
4462+   }
4463+ 
41644464  MachineFrameInfo &MFI = MF.getFrameInfo ();
41654465
41664466  assert (getStackGrowthDirection () == TargetFrameLowering::StackGrowsDown &&
@@ -4170,7 +4470,6 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
41704470  int64_t  SVEStackSize =
41714471      assignSVEStackObjectOffsets (MFI, MinCSFrameIndex, MaxCSFrameIndex);
41724472
4173-   AArch64FunctionInfo *AFI = MF.getInfo <AArch64FunctionInfo>();
41744473  AFI->setStackSizeSVE (alignTo (SVEStackSize, 16U ));
41754474  AFI->setMinMaxSVECSFrameIndex (MinCSFrameIndex, MaxCSFrameIndex);
41764475
@@ -5204,9 +5503,13 @@ void AArch64FrameLowering::emitRemarks(
52045503
52055504          unsigned  RegTy = StackAccess::AccessType::GPR;
52065505          if  (MFI.getStackID (FrameIdx) == TargetStackID::ScalableVector) {
5207-             if  (AArch64::PPRRegClass.contains (MI.getOperand (0 ).getReg ()))
5506+             //  SPILL_PPR_TO_ZPR_SLOT_PSEUDO and FILL_PPR_FROM_ZPR_SLOT_PSEUDO
5507+             //  spill/fill the predicate as a data vector (so are an FPR acess).
5508+             if  (MI.getOpcode () != AArch64::SPILL_PPR_TO_ZPR_SLOT_PSEUDO &&
5509+                 MI.getOpcode () != AArch64::FILL_PPR_FROM_ZPR_SLOT_PSEUDO &&
5510+                 AArch64::PPRRegClass.contains (MI.getOperand (0 ).getReg ())) {
52085511              RegTy = StackAccess::PPR;
5209-             else 
5512+             }  else 
52105513              RegTy = StackAccess::FPR;
52115514          } else  if  (AArch64InstrInfo::isFpOrNEON (MI)) {
52125515            RegTy = StackAccess::FPR;
0 commit comments