From c15104f964cb8c103e8e4aede15e561a853e0947 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 10 Sep 2025 18:04:47 -0500 Subject: [PATCH 01/17] unpack packed instructions overlapped by MFMAs post-RA scheduling --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 60 ++++ llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 + llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 295 +++++++++++++++++- ...ck-non-coissue-insts-post-ra-scheduler.mir | 151 +++++++++ 4 files changed, 503 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 398c99b3bd127..8fce521da157e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6359,6 +6359,66 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return isImmOperandLegal(MI, OpIdx, *MO); } +bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { + bool IsGFX950Only = ST.hasGFX950Insts(); + bool IsGFX940Only = ST.hasGFX940Insts(); + + if (!IsGFX950Only && !IsGFX940Only) + return false; + + if (!isVALU(MI)) + return false; + + // V_COS, V_EXP, V_RCP, etc. + if (isTRANS(MI)) + return true; + + // DOT2, DOT2C, DOT4, etc. + if (isDOT(MI)) + return true; + + // MFMA, SMFMA + if (isMFMA(MI)) + return true; + + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_CVT_PK_BF8_F32_e64: + case AMDGPU::V_CVT_PK_FP8_F32_e64: + case AMDGPU::V_MQSAD_PK_U16_U8_e64: + case AMDGPU::V_MQSAD_U32_U8_e64: + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_I16: + case AMDGPU::V_PK_ADD_U16: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_FMA_F16: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMAC_F16_e32: + case AMDGPU::V_PK_FMAC_F16_e64: + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_MAD_I16: + case AMDGPU::V_PK_MAD_U16: + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_PK_MAX_I16: + case AMDGPU::V_PK_MAX_U16: + case AMDGPU::V_PK_MIN_F16: + case AMDGPU::V_PK_MIN_I16: + case AMDGPU::V_PK_MIN_U16: + case AMDGPU::V_PK_MOV_B32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_LO_U16: + case AMDGPU::V_PK_SUB_I16: + case AMDGPU::V_PK_SUB_U16: + case AMDGPU::V_QSAD_PK_U16_U8_e64: + return true; + default: + return false; + } +} + void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const { unsigned Opc = MI.getOpcode(); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index f7dde2b90b68e..d0b49ffc19600 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1200,6 +1200,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { return isImmOperandLegal(MI.getDesc(), OpNo, MO); } + bool isNeverCoissue(MachineInstr &MI) const; + /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO. bool isLegalAV64PseudoImm(uint64_t Imm) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 2c2ceedf8a2f6..cad096e0d2fcc 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -9,11 +9,19 @@ /// \file /// This pass performs the peephole optimizations before code emission. /// +/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16, +/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be +/// co-issued. This helps with overlapping MFMA and certain vector instructions +/// in machine schedules and is expected to improve performance. Only those +/// packed instructions are unpacked that are overlapped by the MFMA latency. +/// Rest should remain untouched. +/// TODO: Add support for F16 packed instructions //===----------------------------------------------------------------------===// #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/SetVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/BranchProbability.h" @@ -28,6 +36,7 @@ class SIPreEmitPeephole { private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; + MachineRegisterInfo *MRI; bool optimizeVccBranch(MachineInstr &MI) const; bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; @@ -39,6 +48,37 @@ class SIPreEmitPeephole { const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); + // Check if the machine instruction being processed is a supported packed + // instruction + bool isUnpackingSupportedInstr(MachineInstr &MI) const; + // Creates a list of packed instructions following an MFMA that are suitable + // for unpacking. + void createListOfPackedInstr(MachineInstr &BeginMI, + SetVector &InstrsToUnpack, + uint16_t NumMFMACycles); + // Identify register dependencies between those used by the MFMA + // instruction and the following packed instructions. Conservatively ensures + // that we do not incorrectly read/write registers. + bool hasReadWriteDependencies(const MachineInstr &PredMI, + const MachineInstr &SuccMI); + // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA. + // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this + // transformation. + void processF32Unpacking(MachineInstr &I); + // Insert appropriate unpacked instructions into the BB + void insertUnpackedF32MI(MachineInstr &I, bool IsFMA); + // Select corresponding unpacked instruction from packed instruction as input + uint16_t mapToUnpackedOpcode(MachineInstr &I); + // Creates the unpacked instruction to be inserted. Adds source modifiers to + // the unpacked instructions based on the source modifiers in the packed + // instruction + MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, + const DebugLoc &DL, + uint16_t UnpackedOpcode, bool IsHiBits, + bool IsFMA); + void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods, + unsigned NegModifier, unsigned OpSelModifier, + MachineOperand &SrcMO); public: bool run(MachineFunction &MF); @@ -274,11 +314,9 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, return false; if (IdxReg && I->modifiesRegister(IdxReg, TRI)) return false; - if (llvm::any_of(I->operands(), - [&MRI, this](const MachineOperand &MO) { - return MO.isReg() && - TRI->isVectorRegister(MRI, MO.getReg()); - })) { + if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) { + return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg()); + })) { // The only exception allowed here is another indirect vector move // with the same mode. if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write || @@ -417,6 +455,233 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } +bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_FMA_F32: + return true; + default: + return false; + } + llvm_unreachable("Fully covered switch"); +} + +bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, + const MachineInstr &SuccMI) { + for (const MachineOperand &Pred_Ops : PredMI.operands()) { + if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) + continue; + Register Pred_Reg = Pred_Ops.getReg(); + if (!Pred_Reg.isValid()) + continue; + for (const MachineOperand &Succ_Ops : SuccMI.operands()) { + if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) + continue; + Register Succ_Reg = Succ_Ops.getReg(); + if (!Succ_Reg.isValid()) + continue; + if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) { + return true; + } + } + } + return false; +} + +uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { + unsigned Opcode = I.getOpcode(); + // Use 64 bit encoding to allow use of VOP3 instructions. + // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3 + // e32 instructions are VOP2 and don't allow source modifiers + switch (Opcode) { + case AMDGPU::V_PK_ADD_F32: + return AMDGPU::V_ADD_F32_e64; + case AMDGPU::V_PK_MUL_F32: + return AMDGPU::V_MUL_F32_e64; + case AMDGPU::V_PK_FMA_F32: + return AMDGPU::V_FMA_F32_e64; + default: + return std::numeric_limits::max(); + } + llvm_unreachable("Fully covered switch"); +} + +void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI, + unsigned Src_Mods, + unsigned NegModifier, + unsigned OpSelModifier, + MachineOperand &SrcMO) { + unsigned New_Src_Mods = 0; + const TargetRegisterInfo *RI = SrcMO.getParent() + ->getParent() + ->getParent() + ->getSubtarget() + .getRegisterInfo(); + // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit + // lane. + // NEG_HI shares the same bit position with ABS. But packed instructions do + // not support ABS. Therefore, NEG_HI must be translated to NEG source + // modifier for the higher 32 bits. Unpacked VOP3 instructions do support + // ABS, therefore we need to explicitly add the NEG modifier if present in + // the packed instruction + if (Src_Mods & NegModifier) { + New_Src_Mods |= SISrcMods::NEG; + } + // Src modifiers. Only negative modifiers are added if needed. Unpacked + // operations do not have op_sel, therefore it must be handled explicitly as + // done below. Unpacked operations support abs, but packed instructions do + // not. Thus, abs is not handled. + NewMI.addImm(New_Src_Mods); + if (SrcMO.isImm()) { + NewMI.addImm(SrcMO.getImm()); + } else { + // If op_sel == 0, select register 0 of reg:sub0_sub1 + Register UnpackedSrcReg = (Src_Mods & OpSelModifier) + ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) + : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); + if (SrcMO.isReg() && SrcMO.isKill()) + NewMI.addReg(UnpackedSrcReg, RegState::Kill); + else + NewMI.addReg(UnpackedSrcReg); + } +} + +void SIPreEmitPeephole::createListOfPackedInstr( + MachineInstr &BeginMI, SetVector &InstrsToUnpack, + uint16_t NumMFMACycles) { + auto *BB = BeginMI.getParent(); + auto E = BB->end(); + int TotalCyclesBetweenCandidates = 0; + auto SchedModel = TII->getSchedModel(); + for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { + MachineInstr &Instr = *I; + const MCSchedClassDesc *InstrSchedClassDesc = + SchedModel.resolveSchedClass(&Instr); + TotalCyclesBetweenCandidates += + SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; + + if (Instr.isMetaInstruction()) + continue; + if (Instr.isTerminator()) + return; + if (TotalCyclesBetweenCandidates > NumMFMACycles) + return; + if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { + if (hasReadWriteDependencies(BeginMI, Instr)) + continue; + + // If it is a packed instruction, we should subtract it's latency from the + // overall latency calculation here, because the packed instruction will + // be removed and replaced by 2 unpacked instructions + TotalCyclesBetweenCandidates -= + SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; + // We're adding 2 to account for the extra latency added by unpacking into + // 2 instructions. At the time of writing, the considered unpacked + // instructions have latency of 1. + // TODO: improve latency handling of possible inserted instructions + TotalCyclesBetweenCandidates += 2; + // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) { + InstrsToUnpack.insert(&Instr); + // } + } + } + return; +} + +void SIPreEmitPeephole::processF32Unpacking(MachineInstr &I) { + if (SIInstrInfo::modifiesModeRegister(I) || + I.modifiesRegister(AMDGPU::EXEC, TRI)) + return; + bool IsFMA = (I.getOpcode() == AMDGPU::V_PK_FMA_F32) ? true : false; + insertUnpackedF32MI(I, IsFMA); + return; +} + +void SIPreEmitPeephole::insertUnpackedF32MI(MachineInstr &I, bool IsFMA) { + MachineBasicBlock &MBB = *I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register DstReg = I.getOperand(0).getReg(); + + uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); + if (UnpackedOpcode == std::numeric_limits::max()) + return; + + MachineInstrBuilder Op0L_Op1L = createUnpackedMI( + MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA); + if (I.getOperand(0).isUndef()) + Op0L_Op1L->getOperand(0).setIsUndef(); + + MachineInstrBuilder Op0H_Op1H = createUnpackedMI( + MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA); + + if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); + } + if (I.getFlag(MachineInstr::MIFlag::FmContract)) { + Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract); + Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract); + } + if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) { + Op0L_Op1L.getInstr()->getOperand(0).setIsRenamable(true); + Op0H_Op1H.getInstr()->getOperand(0).setIsRenamable(true); + } + + I.eraseFromParent(); + return; +} + +MachineInstrBuilder +SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, + const DebugLoc &DL, uint16_t UnpackedOpcode, + bool IsHiBits, bool IsFMA) { + MachineOperand &DstMO = I.getOperand(0); + MachineOperand &SrcMO1 = I.getOperand(2); + MachineOperand &SrcMO2 = I.getOperand(4); + Register DstReg = DstMO.getReg(); + const TargetRegisterInfo *RI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + Register UnpackedDstReg = IsHiBits ? RI->getSubReg(DstReg, AMDGPU::sub1) + : RI->getSubReg(DstReg, AMDGPU::sub0); + + int ClampIdx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + int64_t ClampVal = I.getOperand(ClampIdx).getImm(); + int Src0_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + int Src1_modifiers_Idx = + AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + + unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); + unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); + // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. + unsigned New_Src0_Mods = 0; + unsigned New_Src1_Mods = 0; + + unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; + + MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); + NewMI.addDef(UnpackedDstReg); // vdst + addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1); + addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2); + + if (IsFMA) { + MachineOperand &SrcMO3 = I.getOperand(6); + int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx( + I.getOpcode(), AMDGPU::OpName::src2_modifiers); + unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); + addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3); + } + NewMI.addImm(ClampVal); // clamp + // Packed instructions do not support output modifiers. safe to assign them 0 + // for this use case + NewMI.addImm(0); // omod + return NewMI; +} + PreservedAnalyses llvm::SIPreEmitPeepholePass::run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM) { @@ -430,6 +695,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); + MRI = &MF.getRegInfo(); bool Changed = false; MF.RenumberBlocks(); @@ -461,7 +727,21 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // and limit the distance to 20 instructions for compile time purposes. // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions // may be bundled with the instructions they modify. + // + // Unpack packed instructions overlapped by MFMAs. This allows the compiler + // to co-issue unpacked instructions with MFMA + uint16_t NumMFMACycles = 0; + auto SchedModel = TII->getSchedModel(); + SetVector InstrsToUnpack; + for (auto &MI : make_early_inc_range(MBB.instrs())) { + if (SIInstrInfo::isMFMA(MI)) { + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles); + } if (Count == Threshold) SetGPRMI = nullptr; else @@ -481,6 +761,11 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { else SetGPRMI = &MI; } + if (!InstrsToUnpack.empty()) { + for (MachineInstr *MI : InstrsToUnpack) { + processF32Unpacking(*MI); + } + } } return Changed; diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir new file mode 100644 index 0000000000000..532344ea9cbd5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -0,0 +1,151 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: test_pk_mul_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_pk_mul_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_op_sel_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5'} + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_op_sel_hi_selection_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5'} + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: S_WAITCNT 49279 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 From fb05da7a164c5227f8cda6c7055ce879c98c7844 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 11 Sep 2025 09:32:08 -0500 Subject: [PATCH 02/17] format error fix and unpack candidate selection condition change --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 68 ++++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 2 +- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 5 +- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 8fce521da157e..2048f61e1486a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -6383,40 +6383,40 @@ bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); switch (Opcode) { - case AMDGPU::V_CVT_PK_BF8_F32_e64: - case AMDGPU::V_CVT_PK_FP8_F32_e64: - case AMDGPU::V_MQSAD_PK_U16_U8_e64: - case AMDGPU::V_MQSAD_U32_U8_e64: - case AMDGPU::V_PK_ADD_F16: - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_ADD_I16: - case AMDGPU::V_PK_ADD_U16: - case AMDGPU::V_PK_ASHRREV_I16: - case AMDGPU::V_PK_FMA_F16: - case AMDGPU::V_PK_FMA_F32: - case AMDGPU::V_PK_FMAC_F16_e32: - case AMDGPU::V_PK_FMAC_F16_e64: - case AMDGPU::V_PK_LSHLREV_B16: - case AMDGPU::V_PK_LSHRREV_B16: - case AMDGPU::V_PK_MAD_I16: - case AMDGPU::V_PK_MAD_U16: - case AMDGPU::V_PK_MAX_F16: - case AMDGPU::V_PK_MAX_I16: - case AMDGPU::V_PK_MAX_U16: - case AMDGPU::V_PK_MIN_F16: - case AMDGPU::V_PK_MIN_I16: - case AMDGPU::V_PK_MIN_U16: - case AMDGPU::V_PK_MOV_B32: - case AMDGPU::V_PK_MUL_F16: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_MUL_LO_U16: - case AMDGPU::V_PK_SUB_I16: - case AMDGPU::V_PK_SUB_U16: - case AMDGPU::V_QSAD_PK_U16_U8_e64: - return true; - default: - return false; - } + case AMDGPU::V_CVT_PK_BF8_F32_e64: + case AMDGPU::V_CVT_PK_FP8_F32_e64: + case AMDGPU::V_MQSAD_PK_U16_U8_e64: + case AMDGPU::V_MQSAD_U32_U8_e64: + case AMDGPU::V_PK_ADD_F16: + case AMDGPU::V_PK_ADD_F32: + case AMDGPU::V_PK_ADD_I16: + case AMDGPU::V_PK_ADD_U16: + case AMDGPU::V_PK_ASHRREV_I16: + case AMDGPU::V_PK_FMA_F16: + case AMDGPU::V_PK_FMA_F32: + case AMDGPU::V_PK_FMAC_F16_e32: + case AMDGPU::V_PK_FMAC_F16_e64: + case AMDGPU::V_PK_LSHLREV_B16: + case AMDGPU::V_PK_LSHRREV_B16: + case AMDGPU::V_PK_MAD_I16: + case AMDGPU::V_PK_MAD_U16: + case AMDGPU::V_PK_MAX_F16: + case AMDGPU::V_PK_MAX_I16: + case AMDGPU::V_PK_MAX_U16: + case AMDGPU::V_PK_MIN_F16: + case AMDGPU::V_PK_MIN_I16: + case AMDGPU::V_PK_MIN_U16: + case AMDGPU::V_PK_MOV_B32: + case AMDGPU::V_PK_MUL_F16: + case AMDGPU::V_PK_MUL_F32: + case AMDGPU::V_PK_MUL_LO_U16: + case AMDGPU::V_PK_SUB_I16: + case AMDGPU::V_PK_SUB_U16: + case AMDGPU::V_QSAD_PK_U16_U8_e64: + return true; + default: + return false; + } } void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index d0b49ffc19600..2f512eac41911 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1201,7 +1201,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { } bool isNeverCoissue(MachineInstr &MI) const; - + /// Check if this immediate value can be used for AV_MOV_B64_IMM_PSEUDO. bool isLegalAV64PseudoImm(uint64_t Imm) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index cad096e0d2fcc..e3d9ac7b031d2 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -582,9 +582,8 @@ void SIPreEmitPeephole::createListOfPackedInstr( // instructions have latency of 1. // TODO: improve latency handling of possible inserted instructions TotalCyclesBetweenCandidates += 2; - // if (!(TotalCyclesBetweenCandidates > NumMFMACycles)) { - InstrsToUnpack.insert(&Instr); - // } + if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1)) + InstrsToUnpack.insert(&Instr); } } return; From bbefcfb618ac571e7198b927f8dc1e91a3f37788 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 11 Sep 2025 10:34:29 -0500 Subject: [PATCH 03/17] code comments, variable and function re-naming, minor code optimizations --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 96 ++++++++++---------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index e3d9ac7b031d2..caa16bfd573ea 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -53,9 +53,10 @@ class SIPreEmitPeephole { bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. - void createListOfPackedInstr(MachineInstr &BeginMI, - SetVector &InstrsToUnpack, - uint16_t NumMFMACycles); + void + selectSuitableInstrsForUnpacking(MachineInstr &BeginMI, + SetVector &InstrsToUnpack, + uint16_t NumMFMACycles); // Identify register dependencies between those used by the MFMA // instruction and the following packed instructions. Conservatively ensures // that we do not incorrectly read/write registers. @@ -64,10 +65,10 @@ class SIPreEmitPeephole { // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA. // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this // transformation. - void processF32Unpacking(MachineInstr &I); + void performF32Unpacking(MachineInstr &I); // Insert appropriate unpacked instructions into the BB void insertUnpackedF32MI(MachineInstr &I, bool IsFMA); - // Select corresponding unpacked instruction from packed instruction as input + // Select corresponding unpacked instruction from packed instruction as I uint16_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed @@ -76,7 +77,9 @@ class SIPreEmitPeephole { const DebugLoc &DL, uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA); - void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods, + // process operands/source modifiers from packed instructions and insert the + // appropriate source modifers and operands into the unpacked instructions + void addOperandAndMods(MachineInstrBuilder NewMI, unsigned Src_Mods, unsigned NegModifier, unsigned OpSelModifier, MachineOperand &SrcMO); @@ -470,21 +473,20 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, const MachineInstr &SuccMI) { - for (const MachineOperand &Pred_Ops : PredMI.operands()) { - if (!Pred_Ops.isReg() || !Pred_Ops.isDef()) + for (const MachineOperand &PredOps : PredMI.operands()) { + if (!PredOps.isReg() || !PredOps.isDef()) continue; - Register Pred_Reg = Pred_Ops.getReg(); - if (!Pred_Reg.isValid()) + Register PredReg = PredOps.getReg(); + if (!PredReg.isValid()) continue; - for (const MachineOperand &Succ_Ops : SuccMI.operands()) { - if (!Succ_Ops.isReg() || !Succ_Ops.isDef()) + for (const MachineOperand &SuccOps : SuccMI.operands()) { + if (!SuccOps.isReg() || !SuccOps.isDef()) continue; - Register Succ_Reg = Succ_Ops.getReg(); - if (!Succ_Reg.isValid()) + Register SuccReg = SuccOps.getReg(); + if (!SuccReg.isValid()) continue; - if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) { + if ((PredReg == SuccReg) || TRI->regsOverlap(PredReg, SuccReg)) return true; - } } } return false; @@ -508,12 +510,12 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { llvm_unreachable("Fully covered switch"); } -void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI, +void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, unsigned Src_Mods, unsigned NegModifier, unsigned OpSelModifier, MachineOperand &SrcMO) { - unsigned New_Src_Mods = 0; + unsigned NewSrcMods = 0; const TargetRegisterInfo *RI = SrcMO.getParent() ->getParent() ->getParent() @@ -526,29 +528,28 @@ void SIPreEmitPeephole::addOperandandMods(MachineInstrBuilder NewMI, // modifier for the higher 32 bits. Unpacked VOP3 instructions do support // ABS, therefore we need to explicitly add the NEG modifier if present in // the packed instruction - if (Src_Mods & NegModifier) { - New_Src_Mods |= SISrcMods::NEG; - } + if (Src_Mods & NegModifier) + NewSrcMods |= SISrcMods::NEG; // Src modifiers. Only negative modifiers are added if needed. Unpacked // operations do not have op_sel, therefore it must be handled explicitly as // done below. Unpacked operations support abs, but packed instructions do // not. Thus, abs is not handled. - NewMI.addImm(New_Src_Mods); + NewMI.addImm(NewSrcMods); if (SrcMO.isImm()) { NewMI.addImm(SrcMO.getImm()); - } else { - // If op_sel == 0, select register 0 of reg:sub0_sub1 - Register UnpackedSrcReg = (Src_Mods & OpSelModifier) - ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) - : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); - if (SrcMO.isReg() && SrcMO.isKill()) - NewMI.addReg(UnpackedSrcReg, RegState::Kill); - else - NewMI.addReg(UnpackedSrcReg); + return; } + // If op_sel == 0, select register 0 of reg:sub0_sub1 + Register UnpackedSrcReg = (Src_Mods & OpSelModifier) + ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) + : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); + if (SrcMO.isReg() && SrcMO.isKill()) + NewMI.addReg(UnpackedSrcReg, RegState::Kill); + else + NewMI.addReg(UnpackedSrcReg); } -void SIPreEmitPeephole::createListOfPackedInstr( +void SIPreEmitPeephole::selectSuitableInstrsForUnpacking( MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles) { auto *BB = BeginMI.getParent(); @@ -589,7 +590,7 @@ void SIPreEmitPeephole::createListOfPackedInstr( return; } -void SIPreEmitPeephole::processF32Unpacking(MachineInstr &I) { +void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { if (SIInstrInfo::modifiesModeRegister(I) || I.modifiesRegister(AMDGPU::EXEC, TRI)) return; @@ -648,31 +649,31 @@ SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, int ClampIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); int64_t ClampVal = I.getOperand(ClampIdx).getImm(); - int Src0_modifiers_Idx = + int Src0ModifiersIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); - int Src1_modifiers_Idx = + int Src1ModifiersIdx = AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); - unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm(); - unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm(); + unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm(); + unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm(); // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. - unsigned New_Src0_Mods = 0; - unsigned New_Src1_Mods = 0; + unsigned NewSrc0Mods = 0; + unsigned NewSrc1Mods = 0; unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1); - addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2); + addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, SrcMO1); + addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, SrcMO2); if (IsFMA) { MachineOperand &SrcMO3 = I.getOperand(6); - int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx( + int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx( I.getOpcode(), AMDGPU::OpName::src2_modifiers); - unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm(); - addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3); + unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm(); + addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -739,7 +740,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { SchedModel.resolveSchedClass(&MI); NumMFMACycles = SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles); + selectSuitableInstrsForUnpacking(MI, InstrsToUnpack, NumMFMACycles); } if (Count == Threshold) SetGPRMI = nullptr; @@ -761,9 +762,8 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { SetGPRMI = &MI; } if (!InstrsToUnpack.empty()) { - for (MachineInstr *MI : InstrsToUnpack) { - processF32Unpacking(*MI); - } + for (MachineInstr *MI : InstrsToUnpack) + performF32Unpacking(*MI); } } From f71631d5a0b19c232a69c78ee744e28e5c4ef06e Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 11 Sep 2025 12:25:39 -0500 Subject: [PATCH 04/17] MIR tests for pk_add and pk_fma --- ...ck-non-coissue-insts-post-ra-scheduler.mir | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 532344ea9cbd5..3a6afc108ff7e 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -149,3 +149,75 @@ body: | $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 + +... +--- +name: test_pk_add_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + + +... +--- +name: test_pk_fma_unpacking_f32 +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 From 6f5c6a529603fb227aee49940c628bca38516e56 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 12 Sep 2025 13:41:49 -0500 Subject: [PATCH 05/17] check if unpacking can introduce dependencies, improve candidate selection, code cleanup --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 233 ++++++++++-------- ...ck-non-coissue-insts-post-ra-scheduler.mir | 117 ++++++++- 2 files changed, 247 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index caa16bfd573ea..86290fa8e01e1 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -36,7 +36,6 @@ class SIPreEmitPeephole { private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; - MachineRegisterInfo *MRI; bool optimizeVccBranch(MachineInstr &MI) const; bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const; @@ -53,33 +52,36 @@ class SIPreEmitPeephole { bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. - void - selectSuitableInstrsForUnpacking(MachineInstr &BeginMI, - SetVector &InstrsToUnpack, - uint16_t NumMFMACycles); + void collectUnpackingCandidates(MachineInstr &BeginMI, + SetVector &InstrsToUnpack, + uint16_t NumMFMACycles); // Identify register dependencies between those used by the MFMA // instruction and the following packed instructions. Conservatively ensures // that we do not incorrectly read/write registers. - bool hasReadWriteDependencies(const MachineInstr &PredMI, - const MachineInstr &SuccMI); - // Unpack F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and V_PK_FMA. - // Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for this - // transformation. + bool hasRWDependencies(const MachineInstr &PredMI, + const MachineInstr &SuccMI); + // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1] + // op_sel_hi:[0,0,0] + // ==> + // v_fma_f32 v0, v1, v3, v3 + // v_fma_f32 v1, v0, v2, v2 + // here, we have overwritten v0 before we use it. This function checks if + // unpacking can lead to such a situation + bool canUnpackingIntroduceDependencies(const MachineInstr &MI); + // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and + // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for + // this transformation. void performF32Unpacking(MachineInstr &I); - // Insert appropriate unpacked instructions into the BB - void insertUnpackedF32MI(MachineInstr &I, bool IsFMA); // Select corresponding unpacked instruction from packed instruction as I uint16_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed // instruction - MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, - const DebugLoc &DL, - uint16_t UnpackedOpcode, bool IsHiBits, - bool IsFMA); + MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode, + bool IsHiBits); // process operands/source modifiers from packed instructions and insert the // appropriate source modifers and operands into the unpacked instructions - void addOperandAndMods(MachineInstrBuilder NewMI, unsigned Src_Mods, + void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods, unsigned NegModifier, unsigned OpSelModifier, MachineOperand &SrcMO); @@ -471,8 +473,8 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { llvm_unreachable("Fully covered switch"); } -bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, - const MachineInstr &SuccMI) { +bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI, + const MachineInstr &SuccMI) { for (const MachineOperand &PredOps : PredMI.operands()) { if (!PredOps.isReg() || !PredOps.isDef()) continue; @@ -480,7 +482,7 @@ bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, if (!PredReg.isValid()) continue; for (const MachineOperand &SuccOps : SuccMI.operands()) { - if (!SuccOps.isReg() || !SuccOps.isDef()) + if (!SuccOps.isReg()) continue; Register SuccReg = SuccOps.getReg(); if (!SuccReg.isValid()) @@ -492,11 +494,54 @@ bool SIPreEmitPeephole::hasReadWriteDependencies(const MachineInstr &PredMI, return false; } +bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( + const MachineInstr &MI) { + unsigned OpCode = MI.getOpcode(); + bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false; + MachineOperand DstMO = MI.getOperand(0); + Register DstReg = DstMO.getReg(); + Register SrcReg0 = MI.getOperand(2).getReg(); + Register SrcReg1 = MI.getOperand(4).getReg(); + + Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); + int Src0ModifiersIdx = + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); + int Src1ModifiersIdx = + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers); + unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm(); + unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm(); + + Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) + : TRI->getSubReg(SrcReg0, AMDGPU::sub0); + Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) + : TRI->getSubReg(SrcReg1, AMDGPU::sub0); + if (UnpackedDstReg == HiSrc0Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg) || + UnpackedDstReg == HiSrc1Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) + return true; + if (IsFMA) { + int Src2ModifiersIdx = + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); + unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm(); + Register SrcReg2 = MI.getOperand(6).getReg(); + Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) + : TRI->getSubReg(SrcReg2, AMDGPU::sub0); + if (UnpackedDstReg == HiSrc2Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + return true; + } + return false; +} + uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); // Use 64 bit encoding to allow use of VOP3 instructions. - // VOP3 instructions allow VOP3P source modifiers to be translated to VOP3 - // e32 instructions are VOP2 and don't allow source modifiers + // VOP3 e64 instructions allow source modifiers + // e32 instructions don't allow source modifiers switch (Opcode) { case AMDGPU::V_PK_ADD_F32: return AMDGPU::V_ADD_F32_e64; @@ -511,16 +556,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { } void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, - unsigned Src_Mods, + unsigned SrcMods, unsigned NegModifier, unsigned OpSelModifier, MachineOperand &SrcMO) { unsigned NewSrcMods = 0; - const TargetRegisterInfo *RI = SrcMO.getParent() - ->getParent() - ->getParent() - ->getSubtarget() - .getRegisterInfo(); // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit // lane. // NEG_HI shares the same bit position with ABS. But packed instructions do @@ -528,7 +568,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, // modifier for the higher 32 bits. Unpacked VOP3 instructions do support // ABS, therefore we need to explicitly add the NEG modifier if present in // the packed instruction - if (Src_Mods & NegModifier) + if (SrcMods & NegModifier) NewSrcMods |= SISrcMods::NEG; // Src modifiers. Only negative modifiers are added if needed. Unpacked // operations do not have op_sel, therefore it must be handled explicitly as @@ -540,16 +580,16 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, return; } // If op_sel == 0, select register 0 of reg:sub0_sub1 - Register UnpackedSrcReg = (Src_Mods & OpSelModifier) - ? RI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) - : RI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); + Register UnpackedSrcReg = (SrcMods & OpSelModifier) + ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) + : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); if (SrcMO.isReg() && SrcMO.isKill()) NewMI.addReg(UnpackedSrcReg, RegState::Kill); else NewMI.addReg(UnpackedSrcReg); } -void SIPreEmitPeephole::selectSuitableInstrsForUnpacking( +void SIPreEmitPeephole::collectUnpackingCandidates( MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles) { auto *BB = BeginMI.getParent(); @@ -558,21 +598,25 @@ void SIPreEmitPeephole::selectSuitableInstrsForUnpacking( auto SchedModel = TII->getSchedModel(); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; + if (Instr.isMetaInstruction()) + continue; + if (Instr.isTerminator()) + return; + if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) + return; const MCSchedClassDesc *InstrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); TotalCyclesBetweenCandidates += SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; - if (Instr.isMetaInstruction()) - continue; - if (Instr.isTerminator()) - return; if (TotalCyclesBetweenCandidates > NumMFMACycles) return; - if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) { - if (hasReadWriteDependencies(BeginMI, Instr)) + if (isUnpackingSupportedInstr(Instr)) { + assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued."); + if (hasRWDependencies(BeginMI, Instr)) + return; + if (canUnpackingIntroduceDependencies(Instr)) continue; - // If it is a packed instruction, we should subtract it's latency from the // overall latency calculation here, because the packed instruction will // be removed and replaced by 2 unpacked instructions @@ -591,74 +635,65 @@ void SIPreEmitPeephole::selectSuitableInstrsForUnpacking( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - if (SIInstrInfo::modifiesModeRegister(I) || - I.modifiesRegister(AMDGPU::EXEC, TRI)) - return; - bool IsFMA = (I.getOpcode() == AMDGPU::V_PK_FMA_F32) ? true : false; - insertUnpackedF32MI(I, IsFMA); - return; -} - -void SIPreEmitPeephole::insertUnpackedF32MI(MachineInstr &I, bool IsFMA) { MachineBasicBlock &MBB = *I.getParent(); - const DebugLoc &DL = I.getDebugLoc(); - Register DstReg = I.getOperand(0).getReg(); + MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); if (UnpackedOpcode == std::numeric_limits::max()) return; - MachineInstrBuilder Op0L_Op1L = createUnpackedMI( - MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA); - if (I.getOperand(0).isUndef()) - Op0L_Op1L->getOperand(0).setIsUndef(); + MachineInstrBuilder Op0LOp1L = + createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false); + MachineOperand LoDstOp = Op0LOp1L->getOperand(0); + + if (DstOp.isUndef()) + LoDstOp.setIsUndef(); - MachineInstrBuilder Op0H_Op1H = createUnpackedMI( - MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA); + MachineInstrBuilder Op0HOp1H = + createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true); + MachineOperand HiDstOp = Op0HOp1H->getOperand(0); if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) { - Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept); - Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0LOp1L->setFlag(MachineInstr::MIFlag::NoFPExcept); + Op0HOp1H->setFlag(MachineInstr::MIFlag::NoFPExcept); } if (I.getFlag(MachineInstr::MIFlag::FmContract)) { - Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract); - Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract); + Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract); + Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract); } if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) { - Op0L_Op1L.getInstr()->getOperand(0).setIsRenamable(true); - Op0H_Op1H.getInstr()->getOperand(0).setIsRenamable(true); + LoDstOp.setIsRenamable(true); + HiDstOp.setIsRenamable(true); } I.eraseFromParent(); return; } -MachineInstrBuilder -SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, - const DebugLoc &DL, uint16_t UnpackedOpcode, - bool IsHiBits, bool IsFMA) { +MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, + uint16_t UnpackedOpcode, + bool IsHiBits) { + MachineBasicBlock &MBB = *I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); MachineOperand &DstMO = I.getOperand(0); MachineOperand &SrcMO1 = I.getOperand(2); MachineOperand &SrcMO2 = I.getOperand(4); Register DstReg = DstMO.getReg(); - const TargetRegisterInfo *RI = - MBB.getParent()->getSubtarget().getRegisterInfo(); - Register UnpackedDstReg = IsHiBits ? RI->getSubReg(DstReg, AMDGPU::sub1) - : RI->getSubReg(DstReg, AMDGPU::sub0); + unsigned OpCode = I.getOpcode(); + Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) + : TRI->getSubReg(DstReg, AMDGPU::sub0); - int ClampIdx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp); + bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false; + int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp); int64_t ClampVal = I.getOperand(ClampIdx).getImm(); int Src0ModifiersIdx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers); + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); int Src1ModifiersIdx = - AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers); + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers); unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm(); unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm(); // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. - unsigned NewSrc0Mods = 0; - unsigned NewSrc1Mods = 0; unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; @@ -670,8 +705,8 @@ SIPreEmitPeephole::createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I, if (IsFMA) { MachineOperand &SrcMO3 = I.getOperand(6); - int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx( - I.getOpcode(), AMDGPU::OpName::src2_modifiers); + int Src2ModifiersIdx = + AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm(); addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3); } @@ -695,12 +730,33 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MRI = &MF.getRegInfo(); bool Changed = false; MF.RenumberBlocks(); for (MachineBasicBlock &MBB : MF) { + // Unpack packed instructions overlapped by MFMAs. This allows the compiler + // to co-issue unpacked instructions with MFMA + uint16_t NumMFMACycles = 0; + auto SchedModel = TII->getSchedModel(); + SetVector InstrsToUnpack; + for (auto &MI : make_early_inc_range(MBB.instrs())) { + if (SIInstrInfo::isMFMA(MI)) { + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); + } + } + if (!InstrsToUnpack.empty()) { + for (MachineInstr *MI : InstrsToUnpack) { + if (!SIInstrInfo::modifiesModeRegister(*MI) && + !MI->modifiesRegister(AMDGPU::EXEC, TRI)) + performF32Unpacking(*MI); + } + } + MachineBasicBlock::iterator TermI = MBB.getFirstTerminator(); // Check first terminator for branches to optimize if (TermI != MBB.end()) { @@ -728,20 +784,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions // may be bundled with the instructions they modify. // - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA - uint16_t NumMFMACycles = 0; - auto SchedModel = TII->getSchedModel(); - SetVector InstrsToUnpack; - for (auto &MI : make_early_inc_range(MBB.instrs())) { - if (SIInstrInfo::isMFMA(MI)) { - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(&MI); - NumMFMACycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - selectSuitableInstrsForUnpacking(MI, InstrsToUnpack, NumMFMACycles); - } if (Count == Threshold) SetGPRMI = nullptr; else @@ -761,10 +804,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { else SetGPRMI = &MI; } - if (!InstrsToUnpack.empty()) { - for (MachineInstr *MI : InstrsToUnpack) - performF32Unpacking(*MI); - } } return Changed; diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 3a6afc108ff7e..9e520b54b3b5d 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -29,8 +29,8 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -79,8 +79,8 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -129,8 +129,8 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: renamable $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -161,6 +161,44 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_pk_add_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GCN-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GCN-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GCN-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec @@ -206,6 +244,25 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_pk_fma_unpacking_f32 + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -221,3 +278,51 @@ body: | $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 + +... +--- +name: test_unpacking_does_not_introduce_rw_dependency +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 From ea85eb788535ce08ac4940f1d59e6619d54cdff0 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 12 Sep 2025 13:47:37 -0500 Subject: [PATCH 06/17] update MIR test cmd --- .../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 9e520b54b3b5d..d45658fb415ea 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -march=amdgcn -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s --- name: test_pk_mul_unpacking_f32 From 4995f00271cd1e1fb5233102e271b9f3bf4d0a90 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 12 Sep 2025 17:34:31 -0500 Subject: [PATCH 07/17] remove use of hard coded operand idx --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 32 +++++++++----------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 86290fa8e01e1..5c353481e6134 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -83,7 +83,7 @@ class SIPreEmitPeephole { // appropriate source modifers and operands into the unpacked instructions void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods, unsigned NegModifier, unsigned OpSelModifier, - MachineOperand &SrcMO); + const MachineOperand &SrcMO); public: bool run(MachineFunction &MF); @@ -497,11 +497,11 @@ bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI, bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); - bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false; + bool IsFMA = OpCode == AMDGPU::V_PK_FMA_F32; MachineOperand DstMO = MI.getOperand(0); Register DstReg = DstMO.getReg(); - Register SrcReg0 = MI.getOperand(2).getReg(); - Register SrcReg1 = MI.getOperand(4).getReg(); + Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg(); + Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg(); Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); int Src0ModifiersIdx = @@ -522,7 +522,7 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( UnpackedDstReg == HiSrc1Reg || TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) return true; - if (IsFMA) { + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm(); @@ -559,7 +559,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods, unsigned NegModifier, unsigned OpSelModifier, - MachineOperand &SrcMO) { + const MachineOperand &SrcMO) { unsigned NewSrcMods = 0; // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit // lane. @@ -616,7 +616,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (hasRWDependencies(BeginMI, Instr)) return; if (canUnpackingIntroduceDependencies(Instr)) - continue; + return; // If it is a packed instruction, we should subtract it's latency from the // overall latency calculation here, because the packed instruction will // be removed and replaced by 2 unpacked instructions @@ -635,7 +635,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineBasicBlock &MBB = *I.getParent(); MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); @@ -661,7 +660,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract); Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract); } - if (I.getOperand(0).getReg().isPhysical() && I.getOperand(0).isRenamable()) { + if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) { LoDstOp.setIsRenamable(true); HiDstOp.setIsRenamable(true); } @@ -676,14 +675,13 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); MachineOperand &DstMO = I.getOperand(0); - MachineOperand &SrcMO1 = I.getOperand(2); - MachineOperand &SrcMO2 = I.getOperand(4); + const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); + const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); Register DstReg = DstMO.getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) : TRI->getSubReg(DstReg, AMDGPU::sub0); - bool IsFMA = (OpCode == AMDGPU::V_PK_FMA_F32) ? true : false; int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp); int64_t ClampVal = I.getOperand(ClampIdx).getImm(); int Src0ModifiersIdx = @@ -700,15 +698,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, SrcMO1); - addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, SrcMO2); + addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, *SrcMO1); + addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, *SrcMO2); - if (IsFMA) { - MachineOperand &SrcMO3 = I.getOperand(6); + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { + const MachineOperand *SrcMO3 = TII->getNamedOperand(I, AMDGPU::OpName::src2); int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm(); - addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, SrcMO3); + addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, *SrcMO3); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 From 05d0c75e120ce172775950d71da86d272ebd748d Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Fri, 12 Sep 2025 17:41:16 -0500 Subject: [PATCH 08/17] update test --- .../AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index d45658fb415ea..0c1e063255cdf 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -307,8 +307,7 @@ body: | ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec From 88e3c6d639a066e1cbfeea5fdce6873b73d3683a Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 16 Sep 2025 15:12:34 -0500 Subject: [PATCH 09/17] direct and transitive dependency on MFMA def, reg kill logic, new tests --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 205 ++++++++++-------- ...ck-non-coissue-insts-post-ra-scheduler.mir | 170 ++++++++++++++- 2 files changed, 275 insertions(+), 100 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 5c353481e6134..d4db028dc1758 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -55,11 +55,6 @@ class SIPreEmitPeephole { void collectUnpackingCandidates(MachineInstr &BeginMI, SetVector &InstrsToUnpack, uint16_t NumMFMACycles); - // Identify register dependencies between those used by the MFMA - // instruction and the following packed instructions. Conservatively ensures - // that we do not incorrectly read/write registers. - bool hasRWDependencies(const MachineInstr &PredMI, - const MachineInstr &SuccMI); // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1] // op_sel_hi:[0,0,0] // ==> @@ -82,8 +77,7 @@ class SIPreEmitPeephole { // process operands/source modifiers from packed instructions and insert the // appropriate source modifers and operands into the unpacked instructions void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods, - unsigned NegModifier, unsigned OpSelModifier, - const MachineOperand &SrcMO); + bool IsHiBits, const MachineOperand &SrcMO); public: bool run(MachineFunction &MF); @@ -460,6 +454,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } +// If support is extended to new operations, add tests in +// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -473,36 +469,10 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { llvm_unreachable("Fully covered switch"); } -bool SIPreEmitPeephole::hasRWDependencies(const MachineInstr &PredMI, - const MachineInstr &SuccMI) { - for (const MachineOperand &PredOps : PredMI.operands()) { - if (!PredOps.isReg() || !PredOps.isDef()) - continue; - Register PredReg = PredOps.getReg(); - if (!PredReg.isValid()) - continue; - for (const MachineOperand &SuccOps : SuccMI.operands()) { - if (!SuccOps.isReg()) - continue; - Register SuccReg = SuccOps.getReg(); - if (!SuccReg.isValid()) - continue; - if ((PredReg == SuccReg) || TRI->regsOverlap(PredReg, SuccReg)) - return true; - } - } - return false; -} - bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); - bool IsFMA = OpCode == AMDGPU::V_PK_FMA_F32; - MachineOperand DstMO = MI.getOperand(0); - Register DstReg = DstMO.getReg(); - Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg(); - Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg(); - + Register DstReg = MI.getOperand(0).getReg(); Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); int Src0ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); @@ -511,28 +481,40 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm(); unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm(); - Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) - : TRI->getSubReg(SrcReg0, AMDGPU::sub0); - Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) - : TRI->getSubReg(SrcReg1, AMDGPU::sub0); - if (UnpackedDstReg == HiSrc0Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg) || - UnpackedDstReg == HiSrc1Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) - return true; + if (TII->getNamedOperand(MI, AMDGPU::OpName::src0)->isReg()) { + Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg(); + Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) + : TRI->getSubReg(SrcReg0, AMDGPU::sub0); + if (UnpackedDstReg == HiSrc0Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) + return true; + } + + if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isReg()) { + Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg(); + Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) + : TRI->getSubReg(SrcReg1, AMDGPU::sub0); + if (UnpackedDstReg == HiSrc1Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) + return true; + } + if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm(); - Register SrcReg2 = MI.getOperand(6).getReg(); - Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) - ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) - : TRI->getSubReg(SrcReg2, AMDGPU::sub0); - if (UnpackedDstReg == HiSrc2Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) - return true; + if (TII->getNamedOperand(MI, AMDGPU::OpName::src2)->isReg()) { + Register SrcReg2 = + TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); + Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) + ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) + : TRI->getSubReg(SrcReg2, AMDGPU::sub0); + if (UnpackedDstReg == HiSrc2Reg || + TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + return true; + } } return false; } @@ -556,11 +538,13 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { } void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, - unsigned SrcMods, - unsigned NegModifier, - unsigned OpSelModifier, + unsigned SrcMods, bool IsHiBits, const MachineOperand &SrcMO) { unsigned NewSrcMods = 0; + unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; + unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; + // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done + // for ABS modifiers. // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit // lane. // NEG_HI shares the same bit position with ABS. But packed instructions do @@ -583,10 +567,30 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, Register UnpackedSrcReg = (SrcMods & OpSelModifier) ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); - if (SrcMO.isReg() && SrcMO.isKill()) - NewMI.addReg(UnpackedSrcReg, RegState::Kill); - else - NewMI.addReg(UnpackedSrcReg); + + MachineOperand UnpackedSrcMO = + MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false); + if (SrcMO.isKill()) { + // For each unpacked instruction, mark its source registers as killed if the + // corresponding source register in the original packed instruction was + // marked as killed. + // + // Exception: + // If the op_sel and op_sel_hi modifiers require both unpacked instructions + // to use the same register (e.g., due to overlapping access to low/high + // bits of the same packed register), then only the *second* (latter) + // instruction should mark the register as killed. This is because the + // second instruction handles the higher bits and is effectively the last + // user of the full register pair. + + bool OpSel = SrcMods & SISrcMods::OP_SEL_0; + bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1; + bool KillState = true; + if ((OpSel == OpSelHi) && !IsHiBits) + KillState = false; + UnpackedSrcMO.setIsKill(KillState); + } + NewMI.add(UnpackedSrcMO); } void SIPreEmitPeephole::collectUnpackingCandidates( @@ -596,6 +600,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates( auto E = BB->end(); int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); + SetVector RegsOverlappedByMFMADef; + RegsOverlappedByMFMADef.insert(BeginMI.getOperand(0).getReg()); for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; if (Instr.isMetaInstruction()) @@ -611,10 +617,25 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (TotalCyclesBetweenCandidates > NumMFMACycles) return; + // Identify register dependencies between those used by the MFMA + // instruction and the following packed instructions. Also checks for + // transitive dependencies between the MFMA def and candidate instruction + // def and uses. Conservatively ensures that we do not incorrectly + // read/write registers. + for (const MachineOperand &InstrMO : Instr.operands()) { + if (InstrMO.isReg() && !InstrMO.isDef()) { + for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) { + if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) { + if (isUnpackingSupportedInstr(Instr)) + return; + RegsOverlappedByMFMADef.insert(Instr.getOperand(0).getReg()); + break; + } + } + } + } if (isUnpackingSupportedInstr(Instr)) { assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued."); - if (hasRWDependencies(BeginMI, Instr)) - return; if (canUnpackingIntroduceDependencies(Instr)) return; // If it is a packed instruction, we should subtract it's latency from the @@ -674,10 +695,9 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, bool IsHiBits) { MachineBasicBlock &MBB = *I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand &DstMO = I.getOperand(0); const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0); const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1); - Register DstReg = DstMO.getReg(); + Register DstReg = I.getOperand(0).getReg(); unsigned OpCode = I.getOpcode(); Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) : TRI->getSubReg(DstReg, AMDGPU::sub0); @@ -691,22 +711,19 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm(); unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm(); - // Packed instructions (VOP3P) do not support abs. It is okay to ignore them. - - unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG; - unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst - addOperandAndMods(NewMI, Src0Mods, NegModifier, OpSelModifier, *SrcMO1); - addOperandAndMods(NewMI, Src1Mods, NegModifier, OpSelModifier, *SrcMO2); + addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1); + addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2); if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - const MachineOperand *SrcMO3 = TII->getNamedOperand(I, AMDGPU::OpName::src2); + const MachineOperand *SrcMO3 = + TII->getNamedOperand(I, AMDGPU::OpName::src2); int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm(); - addOperandAndMods(NewMI, Src2Mods, NegModifier, OpSelModifier, *SrcMO3); + addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); } NewMI.addImm(ClampVal); // clamp // Packed instructions do not support output modifiers. safe to assign them 0 @@ -733,28 +750,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { MF.RenumberBlocks(); for (MachineBasicBlock &MBB : MF) { - // Unpack packed instructions overlapped by MFMAs. This allows the compiler - // to co-issue unpacked instructions with MFMA - uint16_t NumMFMACycles = 0; - auto SchedModel = TII->getSchedModel(); - SetVector InstrsToUnpack; - for (auto &MI : make_early_inc_range(MBB.instrs())) { - if (SIInstrInfo::isMFMA(MI)) { - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(&MI); - NumMFMACycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); - } - } - if (!InstrsToUnpack.empty()) { - for (MachineInstr *MI : InstrsToUnpack) { - if (!SIInstrInfo::modifiesModeRegister(*MI) && - !MI->modifiesRegister(AMDGPU::EXEC, TRI)) - performF32Unpacking(*MI); - } - } - MachineBasicBlock::iterator TermI = MBB.getFirstTerminator(); // Check first terminator for branches to optimize if (TermI != MBB.end()) { @@ -804,5 +799,29 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { } } + for (MachineBasicBlock &MBB : MF) { + // Unpack packed instructions overlapped by MFMAs. This allows the compiler + // to co-issue unpacked instructions with MFMA + uint16_t NumMFMACycles = 0; + auto SchedModel = TII->getSchedModel(); + SetVector InstrsToUnpack; + for (auto &MI : make_early_inc_range(MBB.instrs())) { + if (SIInstrInfo::isMFMA(MI)) { + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); + } + } + if (!InstrsToUnpack.empty()) { + for (MachineInstr *MI : InstrsToUnpack) { + if (!SIInstrInfo::modifiesModeRegister(*MI) && + !MI->modifiesRegister(AMDGPU::EXEC, TRI)) + performF32Unpacking(*MI); + } + } + } + return Changed; } diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 0c1e063255cdf..6a9b406e390c0 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -79,7 +79,7 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 @@ -129,7 +129,7 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 @@ -260,7 +260,7 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 @@ -306,8 +306,9 @@ body: | ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec @@ -322,6 +323,161 @@ body: | early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_opcodes_not_supported_for_unpacking_are_skipped +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GCN-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + + $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + + $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + + S_ENDPGM 0 + +... +--- +name: test_opsel_register_is_correctly_marked_as_killed +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_inst_dependent_on_mfma_are_not_unpacked +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + ; GCN-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked + ; GCN: liveins: $sgpr4_sgpr5 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GCN-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GCN-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GCN-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GCN-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_ENDPGM 0 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 From 56b505fdefbac0598f6735ba53b2505814061e11 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Tue, 16 Sep 2025 15:23:43 -0500 Subject: [PATCH 10/17] update condition for MFMA dependence check --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index d4db028dc1758..6d94476205b78 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -623,7 +623,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( // def and uses. Conservatively ensures that we do not incorrectly // read/write registers. for (const MachineOperand &InstrMO : Instr.operands()) { - if (InstrMO.isReg() && !InstrMO.isDef()) { + if (InstrMO.isReg()) { for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) { if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) { if (isUnpackingSupportedInstr(Instr)) From 3ecc5da732faec35795361875bef14c0a440ea8f Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 17 Sep 2025 11:26:17 -0500 Subject: [PATCH 11/17] update code comments --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 6d94476205b78..62d100cef64ec 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -473,6 +473,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); + // Only the first register in the register pair needs to be checked due to the + // unpacking order. Packed instructions are unpacked such that the lower 32 + // bits (i.e., the first register in the pair) are written first. This can + // introduce dependencies if the first register is written in one instruction + // and then read as part of the higher 32 bits in the subsequent instruction. + // Such scenarios can arise due to specific combinations of op_sel and + // op_sel_hi modifiers. Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); int Src0ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); @@ -486,6 +493,8 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) : TRI->getSubReg(SrcReg0, AMDGPU::sub0); + // Check if the register selected by op_sel_hi is the same as the first + // register in the destination register pair if (UnpackedDstReg == HiSrc0Reg || TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) return true; @@ -501,6 +510,7 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( return true; } + // Applicable for packed instructions with 3 source operands, such as V_PK_FMA if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { int Src2ModifiersIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); @@ -549,15 +559,14 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, // lane. // NEG_HI shares the same bit position with ABS. But packed instructions do // not support ABS. Therefore, NEG_HI must be translated to NEG source - // modifier for the higher 32 bits. Unpacked VOP3 instructions do support - // ABS, therefore we need to explicitly add the NEG modifier if present in - // the packed instruction + // modifier for the higher 32 bits. Unpacked VOP3 instructions support + // ABS, but do not support NEG_HI. Therefore we need to explicitly add the + // NEG modifier if present in the packed instruction if (SrcMods & NegModifier) NewSrcMods |= SISrcMods::NEG; // Src modifiers. Only negative modifiers are added if needed. Unpacked // operations do not have op_sel, therefore it must be handled explicitly as - // done below. Unpacked operations support abs, but packed instructions do - // not. Thus, abs is not handled. + // done below. NewMI.addImm(NewSrcMods); if (SrcMO.isImm()) { NewMI.addImm(SrcMO.getImm()); @@ -799,6 +808,8 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { } } + // TODO: fold this into previous block, if possible. Evaluate and handle any + // side effects. for (MachineBasicBlock &MBB : MF) { // Unpack packed instructions overlapped by MFMAs. This allows the compiler // to co-issue unpacked instructions with MFMA From 48cf50cbea2b96851b8eb130a98b35674c4c0621 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Wed, 17 Sep 2025 21:18:25 -0500 Subject: [PATCH 12/17] update return conditions, reduce LOC --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 131 +++++++++---------- 1 file changed, 60 insertions(+), 71 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 62d100cef64ec..b2f7780e1e2f9 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -48,7 +48,7 @@ class SIPreEmitPeephole { const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); // Check if the machine instruction being processed is a supported packed - // instruction + // instruction. bool isUnpackingSupportedInstr(MachineInstr &MI) const; // Creates a list of packed instructions following an MFMA that are suitable // for unpacking. @@ -60,23 +60,23 @@ class SIPreEmitPeephole { // ==> // v_fma_f32 v0, v1, v3, v3 // v_fma_f32 v1, v0, v2, v2 - // here, we have overwritten v0 before we use it. This function checks if - // unpacking can lead to such a situation + // Here, we have overwritten v0 before we use it. This function checks if + // unpacking can lead to such a situation. bool canUnpackingIntroduceDependencies(const MachineInstr &MI); // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for // this transformation. void performF32Unpacking(MachineInstr &I); - // Select corresponding unpacked instruction from packed instruction as I + // Select corresponding unpacked instruction uint16_t mapToUnpackedOpcode(MachineInstr &I); // Creates the unpacked instruction to be inserted. Adds source modifiers to // the unpacked instructions based on the source modifiers in the packed - // instruction + // instruction. MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode, bool IsHiBits); - // process operands/source modifiers from packed instructions and insert the - // appropriate source modifers and operands into the unpacked instructions - void addOperandAndMods(MachineInstrBuilder NewMI, unsigned SrcMods, + // Process operands/source modifiers from packed instructions and insert the + // appropriate source modifers and operands into the unpacked instructions. + void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods, bool IsHiBits, const MachineOperand &SrcMO); public: @@ -455,7 +455,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, } // If support is extended to new operations, add tests in -// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); switch (Opcode) { @@ -481,48 +481,47 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( // Such scenarios can arise due to specific combinations of op_sel and // op_sel_hi modifiers. Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); - int Src0ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); - int Src1ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers); - unsigned Src0Mods = MI.getOperand(Src0ModifiersIdx).getImm(); - unsigned Src1Mods = MI.getOperand(Src1ModifiersIdx).getImm(); - - if (TII->getNamedOperand(MI, AMDGPU::OpName::src0)->isReg()) { - Register SrcReg0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)->getReg(); + unsigned Src0Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); + + const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0); + if (Src0MO->isReg()) { + Register SrcReg0 = Src0MO->getReg(); Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) : TRI->getSubReg(SrcReg0, AMDGPU::sub0); // Check if the register selected by op_sel_hi is the same as the first - // register in the destination register pair - if (UnpackedDstReg == HiSrc0Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) + // register in the destination register pair. + if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg)) return true; } - if (TII->getNamedOperand(MI, AMDGPU::OpName::src1)->isReg()) { - Register SrcReg1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)->getReg(); + const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); + if (Src1MO->isReg()) { + Register SrcReg1 = Src1MO->getReg(); Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) : TRI->getSubReg(SrcReg1, AMDGPU::sub0); - if (UnpackedDstReg == HiSrc1Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) + if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg)) return true; } - // Applicable for packed instructions with 3 source operands, such as V_PK_FMA + // Applicable for packed instructions with 3 source operands, such as + // V_PK_FMA. if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - int Src2ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); - unsigned Src2Mods = MI.getOperand(Src2ModifiersIdx).getImm(); - if (TII->getNamedOperand(MI, AMDGPU::OpName::src2)->isReg()) { + unsigned Src2Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); + const MachineOperand *Src2MO = + TII->getNamedOperand(MI, AMDGPU::OpName::src2); + if (Src2MO->isReg()) { Register SrcReg2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) : TRI->getSubReg(SrcReg2, AMDGPU::sub0); - if (UnpackedDstReg == HiSrc2Reg || - TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) + if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg)) return true; } } @@ -533,7 +532,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { unsigned Opcode = I.getOpcode(); // Use 64 bit encoding to allow use of VOP3 instructions. // VOP3 e64 instructions allow source modifiers - // e32 instructions don't allow source modifiers + // e32 instructions don't allow source modifiers. switch (Opcode) { case AMDGPU::V_PK_ADD_F32: return AMDGPU::V_ADD_F32_e64; @@ -547,7 +546,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) { llvm_unreachable("Fully covered switch"); } -void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, +void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods, bool IsHiBits, const MachineOperand &SrcMO) { unsigned NewSrcMods = 0; @@ -555,13 +554,13 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0; // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done // for ABS modifiers. - // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit - // lane. - // NEG_HI shares the same bit position with ABS. But packed instructions do - // not support ABS. Therefore, NEG_HI must be translated to NEG source - // modifier for the higher 32 bits. Unpacked VOP3 instructions support - // ABS, but do not support NEG_HI. Therefore we need to explicitly add the - // NEG modifier if present in the packed instruction + // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit + // lane. + // NEG_HI shares the same bit position with ABS. But packed instructions do + // not support ABS. Therefore, NEG_HI must be translated to NEG source + // modifier for the higher 32 bits. Unpacked VOP3 instructions support + // ABS, but do not support NEG_HI. Therefore we need to explicitly add the + // NEG modifier if present in the packed instruction. if (SrcMods & NegModifier) NewSrcMods |= SISrcMods::NEG; // Src modifiers. Only negative modifiers are added if needed. Unpacked @@ -572,7 +571,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder NewMI, NewMI.addImm(SrcMO.getImm()); return; } - // If op_sel == 0, select register 0 of reg:sub0_sub1 + // If op_sel == 0, select register 0 of reg:sub0_sub1. Register UnpackedSrcReg = (SrcMods & OpSelModifier) ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1) : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0); @@ -609,8 +608,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates( auto E = BB->end(); int TotalCyclesBetweenCandidates = 0; auto SchedModel = TII->getSchedModel(); - SetVector RegsOverlappedByMFMADef; - RegsOverlappedByMFMADef.insert(BeginMI.getOperand(0).getReg()); + Register MFMADef = BeginMI.getOperand(0).getReg(); + for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) { MachineInstr &Instr = *I; if (Instr.isMetaInstruction()) @@ -619,6 +618,9 @@ void SIPreEmitPeephole::collectUnpackingCandidates( return; if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) return; + if (SIInstrInfo::modifiesModeRegister(Instr) && + Instr.modifiesRegister(AMDGPU::EXEC, TRI)) + return; const MCSchedClassDesc *InstrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); TotalCyclesBetweenCandidates += @@ -633,14 +635,8 @@ void SIPreEmitPeephole::collectUnpackingCandidates( // read/write registers. for (const MachineOperand &InstrMO : Instr.operands()) { if (InstrMO.isReg()) { - for (unsigned i = 0; i < RegsOverlappedByMFMADef.size(); ++i) { - if (TRI->regsOverlap(RegsOverlappedByMFMADef[i], InstrMO.getReg())) { - if (isUnpackingSupportedInstr(Instr)) - return; - RegsOverlappedByMFMADef.insert(Instr.getOperand(0).getReg()); - break; - } - } + if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) + return; } } if (isUnpackingSupportedInstr(Instr)) { @@ -649,14 +645,15 @@ void SIPreEmitPeephole::collectUnpackingCandidates( return; // If it is a packed instruction, we should subtract it's latency from the // overall latency calculation here, because the packed instruction will - // be removed and replaced by 2 unpacked instructions + // be removed and replaced by 2 unpacked instructions. TotalCyclesBetweenCandidates -= SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; // We're adding 2 to account for the extra latency added by unpacking into // 2 instructions. At the time of writing, the considered unpacked // instructions have latency of 1. - // TODO: improve latency handling of possible inserted instructions + // TODO: improve latency handling of possible inserted instructions. TotalCyclesBetweenCandidates += 2; + // Subtract 1 to account for MFMA issue latency. if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1)) InstrsToUnpack.insert(&Instr); } @@ -711,15 +708,11 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1) : TRI->getSubReg(DstReg, AMDGPU::sub0); - int ClampIdx = AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::clamp); - int64_t ClampVal = I.getOperand(ClampIdx).getImm(); - int Src0ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src0_modifiers); - int Src1ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src1_modifiers); - - unsigned Src0Mods = I.getOperand(Src0ModifiersIdx).getImm(); - unsigned Src1Mods = I.getOperand(Src1ModifiersIdx).getImm(); + int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm(); + unsigned Src0Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm(); + unsigned Src1Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm(); MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode)); NewMI.addDef(UnpackedDstReg); // vdst @@ -729,9 +722,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I, if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { const MachineOperand *SrcMO3 = TII->getNamedOperand(I, AMDGPU::OpName::src2); - int Src2ModifiersIdx = - AMDGPU::getNamedOperandIdx(OpCode, AMDGPU::OpName::src2_modifiers); - unsigned Src2Mods = I.getOperand(Src2ModifiersIdx).getImm(); + unsigned Src2Mods = + TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm(); addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3); } NewMI.addImm(ClampVal); // clamp @@ -785,7 +777,6 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { // and limit the distance to 20 instructions for compile time purposes. // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions // may be bundled with the instructions they modify. - // for (auto &MI : make_early_inc_range(MBB.instrs())) { if (Count == Threshold) SetGPRMI = nullptr; @@ -808,7 +799,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { } } - // TODO: fold this into previous block, if possible. Evaluate and handle any + // TODO: Fold this into previous block, if possible. Evaluate and handle any // side effects. for (MachineBasicBlock &MBB : MF) { // Unpack packed instructions overlapped by MFMAs. This allows the compiler @@ -827,9 +818,7 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { } if (!InstrsToUnpack.empty()) { for (MachineInstr *MI : InstrsToUnpack) { - if (!SIInstrInfo::modifiesModeRegister(*MI) && - !MI->modifiesRegister(AMDGPU::EXEC, TRI)) - performF32Unpacking(*MI); + performF32Unpacking(*MI); } } } From 022fa2d09a5aba6b96e33b191ea1e4d38dce2bbd Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 18 Sep 2025 10:20:48 -0500 Subject: [PATCH 13/17] new tests, reduced nesting, cleanup --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 108 ++++++++---------- ...ck-non-coissue-insts-post-ra-scheduler.mir | 86 ++++++++++++++ 2 files changed, 136 insertions(+), 58 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index b2f7780e1e2f9..71f8721efec04 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -62,7 +62,7 @@ class SIPreEmitPeephole { // v_fma_f32 v1, v0, v2, v2 // Here, we have overwritten v0 before we use it. This function checks if // unpacking can lead to such a situation. - bool canUnpackingIntroduceDependencies(const MachineInstr &MI); + bool canUnpackingClobberRegister(const MachineInstr &MI); // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for // this transformation. @@ -469,7 +469,7 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { llvm_unreachable("Fully covered switch"); } -bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( +bool SIPreEmitPeephole::canUnpackingClobberRegister( const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); @@ -481,14 +481,12 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( // Such scenarios can arise due to specific combinations of op_sel and // op_sel_hi modifiers. Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0); - unsigned Src0Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); - unsigned Src1Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0MO->isReg()) { + if (Src0MO && Src0MO->isReg()) { Register SrcReg0 = Src0MO->getReg(); + unsigned Src0Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm(); Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg0, AMDGPU::sub1) : TRI->getSubReg(SrcReg0, AMDGPU::sub0); @@ -499,8 +497,10 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( } const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - if (Src1MO->isReg()) { + if (Src1MO && Src1MO->isReg()) { Register SrcReg1 = Src1MO->getReg(); + unsigned Src1Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm(); Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg1, AMDGPU::sub1) : TRI->getSubReg(SrcReg1, AMDGPU::sub0); @@ -511,13 +511,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies( // Applicable for packed instructions with 3 source operands, such as // V_PK_FMA. if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) { - unsigned Src2Mods = - TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); const MachineOperand *Src2MO = TII->getNamedOperand(MI, AMDGPU::OpName::src2); - if (Src2MO->isReg()) { + if (Src2MO && Src2MO->isReg()) { Register SrcReg2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); + unsigned Src2Mods = + TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) ? TRI->getSubReg(SrcReg2, AMDGPU::sub1) : TRI->getSubReg(SrcReg2, AMDGPU::sub0); @@ -614,19 +614,19 @@ void SIPreEmitPeephole::collectUnpackingCandidates( MachineInstr &Instr = *I; if (Instr.isMetaInstruction()) continue; - if (Instr.isTerminator()) - return; - if (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) - return; - if (SIInstrInfo::modifiesModeRegister(Instr) && - Instr.modifiesRegister(AMDGPU::EXEC, TRI)) + if ((Instr.isTerminator()) || + (TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) || + (SIInstrInfo::modifiesModeRegister(Instr) && + Instr.modifiesRegister(AMDGPU::EXEC, TRI))) return; + const MCSchedClassDesc *InstrSchedClassDesc = SchedModel.resolveSchedClass(&Instr); - TotalCyclesBetweenCandidates += + uint16_t Latency = SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; + TotalCyclesBetweenCandidates += Latency; - if (TotalCyclesBetweenCandidates > NumMFMACycles) + if (TotalCyclesBetweenCandidates > NumMFMACycles - 1) return; // Identify register dependencies between those used by the MFMA // instruction and the following packed instructions. Also checks for @@ -634,29 +634,26 @@ void SIPreEmitPeephole::collectUnpackingCandidates( // def and uses. Conservatively ensures that we do not incorrectly // read/write registers. for (const MachineOperand &InstrMO : Instr.operands()) { - if (InstrMO.isReg()) { - if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) - return; - } - } - if (isUnpackingSupportedInstr(Instr)) { - assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued."); - if (canUnpackingIntroduceDependencies(Instr)) + if (!InstrMO.isReg() || !InstrMO.getReg().isValid()) + continue; + if (TRI->regsOverlap(MFMADef, InstrMO.getReg())) return; - // If it is a packed instruction, we should subtract it's latency from the - // overall latency calculation here, because the packed instruction will - // be removed and replaced by 2 unpacked instructions. - TotalCyclesBetweenCandidates -= - SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; - // We're adding 2 to account for the extra latency added by unpacking into - // 2 instructions. At the time of writing, the considered unpacked - // instructions have latency of 1. - // TODO: improve latency handling of possible inserted instructions. - TotalCyclesBetweenCandidates += 2; - // Subtract 1 to account for MFMA issue latency. - if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1)) - InstrsToUnpack.insert(&Instr); } + if (!isUnpackingSupportedInstr(Instr)) + continue; + + assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued."); + if (canUnpackingClobberRegister(Instr)) + return; + // If it's a packed instruction, adjust latency: remove the packed + // latency, add latency of two unpacked instructions (currently estimated + // as 2 cycles). + TotalCyclesBetweenCandidates -= Latency; + // TODO: improve latency handling based on instruction modeling. + TotalCyclesBetweenCandidates += 2; + // Subtract 1 to account for MFMA issue latency. + if (TotalCyclesBetweenCandidates < NumMFMACycles - 1) + InstrsToUnpack.insert(&Instr); } return; } @@ -672,8 +669,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false); MachineOperand LoDstOp = Op0LOp1L->getOperand(0); - if (DstOp.isUndef()) - LoDstOp.setIsUndef(); + LoDstOp.setIsUndef(DstOp.isUndef()); MachineInstrBuilder Op0HOp1H = createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true); @@ -687,10 +683,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { Op0LOp1L->setFlag(MachineInstr::MIFlag::FmContract); Op0HOp1H->setFlag(MachineInstr::MIFlag::FmContract); } - if (DstOp.getReg().isPhysical() && DstOp.isRenamable()) { - LoDstOp.setIsRenamable(true); - HiDstOp.setIsRenamable(true); - } + + LoDstOp.setIsRenamable(DstOp.isRenamable()); + HiDstOp.setIsRenamable(DstOp.isRenamable()); I.eraseFromParent(); return; @@ -804,22 +799,19 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) { for (MachineBasicBlock &MBB : MF) { // Unpack packed instructions overlapped by MFMAs. This allows the compiler // to co-issue unpacked instructions with MFMA - uint16_t NumMFMACycles = 0; auto SchedModel = TII->getSchedModel(); SetVector InstrsToUnpack; for (auto &MI : make_early_inc_range(MBB.instrs())) { - if (SIInstrInfo::isMFMA(MI)) { - const MCSchedClassDesc *SchedClassDesc = - SchedModel.resolveSchedClass(&MI); - NumMFMACycles = - SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; - collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); - } + if (!SIInstrInfo::isMFMA(MI)) + continue; + const MCSchedClassDesc *SchedClassDesc = + SchedModel.resolveSchedClass(&MI); + uint16_t NumMFMACycles = + SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle; + collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles); } - if (!InstrsToUnpack.empty()) { - for (MachineInstr *MI : InstrsToUnpack) { - performF32Unpacking(*MI); - } + for (MachineInstr *MI : InstrsToUnpack) { + performF32Unpacking(*MI); } } diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 6a9b406e390c0..054ca918fdb0f 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -481,3 +481,89 @@ body: | $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 + +... +--- +name: test_mfma_def_using_instr_blocks_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_unpacking_with_imm_input +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_neg_lo_hi_post_unpacking +tracksRegLiveness: true + +liveins: + - { reg: '$sgpr4_sgpr5' } + +body: | + bb.0.entry: + liveins: $sgpr4_sgpr5 + early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + S_WAITCNT 49279 + renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + S_WAITCNT 49279 + $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + S_ENDPGM 0 \ No newline at end of file From 720468f8cd1dfd6a1ee6b782a0f5d3a4ad6f7b29 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 18 Sep 2025 12:29:48 -0500 Subject: [PATCH 14/17] update tests, update unpacking selection condition --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 3 +- ...ck-non-coissue-insts-post-ra-scheduler.mir | 1014 +++++++++++++---- 2 files changed, 811 insertions(+), 206 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 71f8721efec04..fb89248cb5919 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -458,6 +458,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); + if (!TII->isNeverCoissue(MI)) + return false; switch (Opcode) { case AMDGPU::V_PK_ADD_F32: case AMDGPU::V_PK_MUL_F32: @@ -642,7 +644,6 @@ void SIPreEmitPeephole::collectUnpackingCandidates( if (!isUnpackingSupportedInstr(Instr)) continue; - assert(TII->isNeverCoissue(Instr) && "Instruction cannot be co-issued."); if (canUnpackingClobberRegister(Instr)) return; // If it's a packed instruction, adjust latency: remove the packed diff --git a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir index 054ca918fdb0f..8b467eb0b054e 100644 --- a/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir +++ b/llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir @@ -1,5 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx950 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX950 %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX942 %s +# RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GFX90a %s --- name: test_pk_mul_unpacking_f32 @@ -11,27 +13,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_pk_mul_unpacking_f32 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_pk_mul_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -61,27 +106,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_op_sel_selection_unpacking_f32 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_op_sel_selection_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -111,27 +199,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_op_sel_hi_selection_unpacking_f32 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: S_WAITCNT 49279 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_op_sel_hi_selection_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -161,44 +292,119 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_pk_add_unpacking_f32 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec - ; GCN-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec - ; GCN-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec - ; GCN-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec - ; GCN-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec - ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_pk_add_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX950-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX950-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX950-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX950-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX950-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX950-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_pk_add_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX942-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX942-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX942-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX942-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX942-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX942-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr2 = nofpexcept V_ADD_F32_e64 0, killed $sgpr2, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr3 = nofpexcept V_ADD_F32_e64 0, killed $sgpr3, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr6 = nofpexcept V_ADD_F32_e64 0, killed $sgpr6, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr7 = nofpexcept V_ADD_F32_e64 0, killed $sgpr7, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = nofpexcept V_ADD_F32_e64 0, killed $sgpr4, 0, $vgpr0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr5 = nofpexcept V_ADD_F32_e64 0, killed $sgpr5, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_pk_add_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec + ; GFX90a-NEXT: renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec + ; GFX90a-NEXT: renamable $vgpr2 = V_MOV_B32_e32 4, implicit $exec + ; GFX90a-NEXT: renamable $vgpr3 = V_MOV_B32_e32 3, implicit $exec + ; GFX90a-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: renamable $vgpr16 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: $agpr31 = V_ACCVGPR_WRITE_B32_e64 $sgpr15, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr30 = V_ACCVGPR_WRITE_B32_e64 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr29 = V_ACCVGPR_WRITE_B32_e64 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr28 = V_ACCVGPR_WRITE_B32_e64 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr27 = V_ACCVGPR_WRITE_B32_e64 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr26 = V_ACCVGPR_WRITE_B32_e64 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr25 = V_ACCVGPR_WRITE_B32_e64 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr24 = V_ACCVGPR_WRITE_B32_e64 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr23 = V_ACCVGPR_WRITE_B32_e64 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr22 = V_ACCVGPR_WRITE_B32_e64 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr21 = V_ACCVGPR_WRITE_B32_e64 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr20 = V_ACCVGPR_WRITE_B32_e64 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr19 = V_ACCVGPR_WRITE_B32_e64 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr18 = V_ACCVGPR_WRITE_B32_e64 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr17 = V_ACCVGPR_WRITE_B32_e64 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90a-NEXT: $agpr16 = V_ACCVGPR_WRITE_B32_e64 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec + ; GFX90a-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr18, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr1 = V_MOV_B32_e32 killed $sgpr19, implicit $exec, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_BF8_BF8_e64 killed $vgpr4_vgpr5, killed $vgpr2_vgpr3, killed $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31, 1, 2, 3, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; GFX90a-NEXT: renamable $vgpr2_vgpr3 = nofpexcept V_PK_ADD_F32 8, killed $sgpr2_sgpr3, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: renamable $vgpr6_vgpr7 = nofpexcept V_PK_ADD_F32 8, killed $sgpr6_sgpr7, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_ADD_F32 8, killed $sgpr4_sgpr5, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: renamable $vgpr10_vgpr11 = nofpexcept V_PK_ADD_F32 8, killed $sgpr10_sgpr11, 8, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr4 = V_MOV_B32_e32 2, implicit $exec renamable $vgpr5 = V_MOV_B32_e32 1, implicit $exec @@ -244,25 +450,64 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_pk_fma_unpacking_f32 - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_pk_fma_unpacking_f32 + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_pk_fma_unpacking_f32 + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, killed $vgpr4, 0, $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_pk_fma_unpacking_f32 + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -290,26 +535,67 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_unpacking_does_not_introduce_rw_dependency - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr4 = nofpexcept V_MUL_F32_e64 0, $sgpr30, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr5 = nofpexcept V_MUL_F32_e64 0, $sgpr31, 0, $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_unpacking_does_not_introduce_rw_dependency + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_MUL_F32 8, $sgpr30_sgpr31, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: renamable $vgpr4_vgpr5 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, $vgpr4_vgpr5, 0, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -338,28 +624,74 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec - ; GCN-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_opcodes_not_supported_for_unpacking_are_skipped + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_dpp $vgpr4, $vgpr4, 228, 15, 15, -1, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_CVT_PK_BF8_F32_e64 0, killed $vgpr4, 0, $vgpr4, $vgpr5, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: $vgpr8_vgpr9 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: $vgpr10_vgpr11 = V_PK_MOV_B32 12, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 0, 0, 0, 0, 0, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -394,27 +726,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_opsel_register_is_correctly_marked_as_killed - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec - ; GCN-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_FMA_F32_e64 0, $sgpr30, 0, $vgpr5, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_FMA_F32_e64 0, killed $sgpr30, 0, killed $vgpr5, 0, killed $vgpr7, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_opsel_register_is_correctly_marked_as_killed + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr6 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr7 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -444,26 +819,68 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 - ; GCN-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked - ; GCN: liveins: $sgpr4_sgpr5 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 - ; GCN-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec - ; GCN-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 - ; GCN-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 - ; GCN-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 - ; GCN-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec - ; GCN-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 - ; GCN-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec - ; GCN-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec - ; GCN-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec - ; GCN-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec - ; GCN-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec - ; GCN-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: S_ENDPGM 0 + ; GFX950-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX950-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX942-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_inst_dependent_on_mfma_are_not_unpacked + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90a-NEXT: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 0, killed $sgpr30_sgpr31, 12, killed $vgpr4_vgpr5, 8, killed $vgpr6_vgpr7, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -493,6 +910,65 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX950-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX942-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_mfma_def_using_instr_blocks_unpacking + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_FMA_F32 8, killed $sgpr30_sgpr31, 8, killed $vgpr4_vgpr5, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 @@ -521,6 +997,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_unpacking_with_imm_input + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_unpacking_with_imm_input + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, 1065353216, 0, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_unpacking_with_imm_input + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, 1065353216, 8, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -550,6 +1090,70 @@ liveins: body: | bb.0.entry: liveins: $sgpr4_sgpr5 + ; GFX950-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX950: liveins: $sgpr4_sgpr5 + ; GFX950-NEXT: {{ $}} + ; GFX950-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX950-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX950-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX950-NEXT: S_WAITCNT 49279 + ; GFX950-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX950-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX950-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX950-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX950-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX950-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX950-NEXT: S_ENDPGM 0 + ; + ; GFX942-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX942: liveins: $sgpr4_sgpr5 + ; GFX942-NEXT: {{ $}} + ; GFX942-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX942-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX942-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX942-NEXT: S_WAITCNT 49279 + ; GFX942-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX942-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX942-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX942-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX942-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX942-NEXT: $vgpr16 = nofpexcept V_MUL_F32_e64 0, killed $sgpr30, 1, killed $vgpr4, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: $vgpr17 = nofpexcept V_MUL_F32_e64 0, killed $sgpr31, 1, killed $vgpr5, 0, 0, implicit $mode, implicit $exec + ; GFX942-NEXT: S_ENDPGM 0 + ; + ; GFX90a-LABEL: name: test_neg_lo_hi_post_unpacking + ; GFX90a: liveins: $sgpr4_sgpr5 + ; GFX90a-NEXT: {{ $}} + ; GFX90a-NEXT: early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 + ; GFX90a-NEXT: renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47 = S_LOAD_DWORDX4_IMM renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: renamable $sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX4_IMM renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr42_sgpr43, 0, 0 + ; GFX90a-NEXT: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX16_IMM_ec killed renamable $sgpr40_sgpr41, 0, 0 + ; GFX90a-NEXT: S_WAITCNT 49279 + ; GFX90a-NEXT: $vgpr0_vgpr1 = V_MOV_B64_e32 $sgpr44_sgpr45, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr44_sgpr45_sgpr46_sgpr47 + ; GFX90a-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e32 killed $sgpr46_sgpr47, implicit $exec, implicit $sgpr44_sgpr45_sgpr46_sgpr47, implicit $exec + ; GFX90a-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e32 $sgpr48_sgpr49, implicit $exec, implicit-def $vgpr4_vgpr5_vgpr6_vgpr7, implicit $sgpr48_sgpr49_sgpr50_sgpr51 + ; GFX90a-NEXT: $vgpr6_vgpr7 = V_MOV_B64_e32 killed $sgpr50_sgpr51, implicit $exec, implicit $sgpr48_sgpr49_sgpr50_sgpr51, implicit $exec + ; GFX90a-NEXT: early-clobber renamable $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X16_F16_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec + ; GFX90a-NEXT: $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec + ; GFX90a-NEXT: renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec + ; GFX90a-NEXT: S_ENDPGM 0 early-clobber renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM_ec killed renamable $sgpr4_sgpr5, 0, 0 renamable $vgpr18 = V_MOV_B32_e32 0, implicit $exec S_WAITCNT 49279 @@ -566,4 +1170,4 @@ body: | $vgpr4 = V_MOV_B32_e32 killed $sgpr14, implicit $exec, implicit $exec $vgpr5 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit $exec renamable $vgpr16_vgpr17 = nofpexcept V_PK_MUL_F32 8, killed $sgpr30_sgpr31, 11, killed $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $mode, implicit $exec - S_ENDPGM 0 \ No newline at end of file + S_ENDPGM 0 From 4685911d43555f9629091ed294e9c0dce78f6f60 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 18 Sep 2025 15:09:52 -0500 Subject: [PATCH 15/17] minor fixes --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index fb89248cb5919..e71f5ed1cdaa0 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -457,9 +457,9 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, // If support is extended to new operations, add tests in // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir. bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { - unsigned Opcode = MI.getOpcode(); if (!TII->isNeverCoissue(MI)) return false; + unsigned Opcode = MI.getOpcode(); switch (Opcode) { case AMDGPU::V_PK_ADD_F32: case AMDGPU::V_PK_MUL_F32: @@ -516,8 +516,7 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister( const MachineOperand *Src2MO = TII->getNamedOperand(MI, AMDGPU::OpName::src2); if (Src2MO && Src2MO->isReg()) { - Register SrcReg2 = - TII->getNamedOperand(MI, AMDGPU::OpName::src2)->getReg(); + Register SrcReg2 = Src2MO->getReg(); unsigned Src2Mods = TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm(); Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1) @@ -628,7 +627,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle; TotalCyclesBetweenCandidates += Latency; - if (TotalCyclesBetweenCandidates > NumMFMACycles - 1) + if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1) return; // Identify register dependencies between those used by the MFMA // instruction and the following packed instructions. Also checks for @@ -663,8 +662,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - if (UnpackedOpcode == std::numeric_limits::max()) - return; + assert(UnpackedOpcode != std::numeric_limits::max() && "Unsupported Opcode"); MachineInstrBuilder Op0LOp1L = createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false); From 3e016ca3ee60cb6fffccffc519be885412d65d5a Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 18 Sep 2025 16:12:38 -0500 Subject: [PATCH 16/17] update failing test --- .../AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir index aad6e031aa9ed..b07dec326327e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.exp.large.mir @@ -536,9 +536,12 @@ ; GCN-NEXT: v_fma_f32 v59, s4, v59, -v134 ; GCN-NEXT: v_pk_mul_f32 v[80:81], v[80:81], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[64:79], v[140:141], v[160:161], v[64:79] - ; GCN-NEXT: v_pk_mul_f32 v[82:83], v[82:83], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[84:85], v[84:85], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[86:87], v[86:87], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v82, v82, v48 + ; GCN-NEXT: v_mul_f32_e64 v83, v83, v48 + ; GCN-NEXT: v_mul_f32_e64 v84, v84, v48 + ; GCN-NEXT: v_mul_f32_e64 v85, v85, v48 + ; GCN-NEXT: v_mul_f32_e64 v86, v86, v48 + ; GCN-NEXT: v_mul_f32_e64 v87, v87, v48 ; GCN-NEXT: v_pk_mul_f32 v[88:89], v[88:89], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[90:91], v[90:91], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[92:93], v[92:93], v[48:49] op_sel_hi:[1,0] @@ -547,9 +550,12 @@ ; GCN-NEXT: v_exp_f32_e32 v58, v58 ; GCN-NEXT: v_pk_mul_f32 v[96:97], v[96:97], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[80:95], v[144:145], v[160:161], v[80:95] - ; GCN-NEXT: v_pk_mul_f32 v[98:99], v[98:99], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[100:101], v[100:101], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[102:103], v[102:103], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v98, v98, v48 + ; GCN-NEXT: v_mul_f32_e64 v99, v99, v48 + ; GCN-NEXT: v_mul_f32_e64 v100, v100, v48 + ; GCN-NEXT: v_mul_f32_e64 v101, v101, v48 + ; GCN-NEXT: v_mul_f32_e64 v102, v102, v48 + ; GCN-NEXT: v_mul_f32_e64 v103, v103, v48 ; GCN-NEXT: v_pk_mul_f32 v[104:105], v[104:105], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[106:107], v[106:107], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[108:109], v[108:109], v[48:49] op_sel_hi:[1,0] @@ -561,8 +567,10 @@ ; GCN-NEXT: v_exp_f32_e32 v59, v57 ; GCN-NEXT: v_mfma_f32_32x32x8_f16 v[96:111], v[148:149], v[160:161], v[96:111] ; GCN-NEXT: v_fma_f32 v60, s4, v60, -v134 - ; GCN-NEXT: v_pk_mul_f32 v[112:113], v[112:113], v[48:49] op_sel_hi:[1,0] - ; GCN-NEXT: v_pk_mul_f32 v[114:115], v[114:115], v[48:49] op_sel_hi:[1,0] + ; GCN-NEXT: v_mul_f32_e64 v112, v112, v48 + ; GCN-NEXT: v_mul_f32_e64 v113, v113, v48 + ; GCN-NEXT: v_mul_f32_e64 v114, v114, v48 + ; GCN-NEXT: v_mul_f32_e64 v115, v115, v48 ; GCN-NEXT: v_pk_mul_f32 v[116:117], v[116:117], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[118:119], v[118:119], v[48:49] op_sel_hi:[1,0] ; GCN-NEXT: v_pk_mul_f32 v[120:121], v[120:121], v[48:49] op_sel_hi:[1,0] From 1b2bf711eb920b78c24e31a6c958751d0bdd0007 Mon Sep 17 00:00:00 2001 From: Akash Dutta Date: Thu, 18 Sep 2025 17:01:54 -0500 Subject: [PATCH 17/17] clang-formatted --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index e71f5ed1cdaa0..4d3331ab353d3 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -471,8 +471,7 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const { llvm_unreachable("Fully covered switch"); } -bool SIPreEmitPeephole::canUnpackingClobberRegister( - const MachineInstr &MI) { +bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) { unsigned OpCode = MI.getOpcode(); Register DstReg = MI.getOperand(0).getReg(); // Only the first register in the register pair needs to be checked due to the @@ -662,7 +661,8 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { MachineOperand DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); - assert(UnpackedOpcode != std::numeric_limits::max() && "Unsupported Opcode"); + assert(UnpackedOpcode != std::numeric_limits::max() && + "Unsupported Opcode"); MachineInstrBuilder Op0LOp1L = createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);