-
Couldn't load subscription status.
- Fork 15k
[AMDGPU] Use reverse iteration in CodeGenPrepare #145484
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,6 +15,7 @@ | |
| #include "AMDGPU.h" | ||
| #include "AMDGPUTargetMachine.h" | ||
| #include "SIModeRegisterDefaults.h" | ||
| #include "llvm/ADT/SetVector.h" | ||
| #include "llvm/Analysis/AssumptionCache.h" | ||
| #include "llvm/Analysis/ConstantFolding.h" | ||
| #include "llvm/Analysis/TargetLibraryInfo.h" | ||
|
|
@@ -27,6 +28,7 @@ | |
| #include "llvm/IR/InstVisitor.h" | ||
| #include "llvm/IR/IntrinsicsAMDGPU.h" | ||
| #include "llvm/IR/PatternMatch.h" | ||
| #include "llvm/IR/ValueHandle.h" | ||
| #include "llvm/InitializePasses.h" | ||
| #include "llvm/Pass.h" | ||
| #include "llvm/Support/KnownBits.h" | ||
|
|
@@ -106,6 +108,7 @@ class AMDGPUCodeGenPrepareImpl | |
| bool FlowChanged = false; | ||
| mutable Function *SqrtF32 = nullptr; | ||
| mutable Function *LdexpF32 = nullptr; | ||
| mutable SmallVector<WeakVH> DeadVals; | ||
|
|
||
| DenseMap<const PHINode *, bool> BreakPhiNodesCache; | ||
|
|
||
|
|
@@ -242,6 +245,8 @@ class AMDGPUCodeGenPrepareImpl | |
| Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src, | ||
| FastMathFlags FMF) const; | ||
|
|
||
| bool tryNarrowMathIfNoOverflow(Instruction *I); | ||
|
|
||
| public: | ||
| bool visitFDiv(BinaryOperator &I); | ||
|
|
||
|
|
@@ -281,28 +286,21 @@ bool AMDGPUCodeGenPrepareImpl::run() { | |
| BreakPhiNodesCache.clear(); | ||
| bool MadeChange = false; | ||
|
|
||
| Function::iterator NextBB; | ||
| for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) { | ||
| BasicBlock *BB = &*FI; | ||
| NextBB = std::next(FI); | ||
|
|
||
| BasicBlock::iterator Next; | ||
| for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; | ||
| I = Next) { | ||
| Next = std::next(I); | ||
|
|
||
| MadeChange |= visit(*I); | ||
|
|
||
| if (Next != E) { // Control flow changed | ||
| BasicBlock *NextInstBB = Next->getParent(); | ||
| if (NextInstBB != BB) { | ||
| BB = NextInstBB; | ||
| E = BB->end(); | ||
| FE = F.end(); | ||
| } | ||
| } | ||
| // Need to use make_early_inc_range because integer division expansion is | ||
| // handled by Transform/Utils, and it can delete instructions such as the | ||
| // terminator of the BB. | ||
| for (BasicBlock &BB : reverse(F)) { | ||
| for (Instruction &I : make_early_inc_range(reverse(BB))) { | ||
| if (!isInstructionTriviallyDead(&I, TLI)) | ||
| MadeChange |= visit(I); | ||
| } | ||
| } | ||
|
|
||
| while (!DeadVals.empty()) { | ||
| if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val())) | ||
| RecursivelyDeleteTriviallyDeadInstructions(I, TLI); | ||
| } | ||
|
|
||
| return MadeChange; | ||
| } | ||
|
|
||
|
|
@@ -422,7 +420,7 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const { | |
| Value *NewVal = insertValues(Builder, Ty, ResultVals); | ||
| NewVal->takeName(&I); | ||
| I.replaceAllUsesWith(NewVal); | ||
| I.eraseFromParent(); | ||
| DeadVals.push_back(&I); | ||
|
|
||
| return true; | ||
| } | ||
|
|
@@ -496,10 +494,10 @@ bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const { | |
| FoldedT, FoldedF); | ||
| NewSelect->takeName(&BO); | ||
| BO.replaceAllUsesWith(NewSelect); | ||
| BO.eraseFromParent(); | ||
| DeadVals.push_back(&BO); | ||
| if (CastOp) | ||
| CastOp->eraseFromParent(); | ||
| Sel->eraseFromParent(); | ||
| DeadVals.push_back(CastOp); | ||
| DeadVals.push_back(Sel); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -895,7 +893,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) { | |
| if (NewVal) { | ||
| FDiv.replaceAllUsesWith(NewVal); | ||
| NewVal->takeName(&FDiv); | ||
| RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLI); | ||
| DeadVals.push_back(&FDiv); | ||
| } | ||
|
|
||
| return true; | ||
|
|
@@ -1302,10 +1300,7 @@ it will create `s_and_b32 s0, s0, 0xff`. | |
| We accept this change since the non-byte load assumes the upper bits | ||
| within the byte are all 0. | ||
| */ | ||
| static bool tryNarrowMathIfNoOverflow(Instruction *I, | ||
| const SITargetLowering *TLI, | ||
| const TargetTransformInfo &TTI, | ||
| const DataLayout &DL) { | ||
| bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) { | ||
| unsigned Opc = I->getOpcode(); | ||
| Type *OldType = I->getType(); | ||
|
|
||
|
|
@@ -1330,6 +1325,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, | |
| NewType = I->getType()->getWithNewBitWidth(NewBit); | ||
|
|
||
| // Old cost | ||
| const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F); | ||
| InstructionCost OldCost = | ||
| TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput); | ||
| // New cost of new op | ||
|
|
@@ -1360,7 +1356,7 @@ static bool tryNarrowMathIfNoOverflow(Instruction *I, | |
|
|
||
| Value *Zext = Builder.CreateZExt(Arith, OldType); | ||
| I->replaceAllUsesWith(Zext); | ||
| I->eraseFromParent(); | ||
| DeadVals.push_back(I); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -1370,8 +1366,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { | |
|
|
||
| if (UseMul24Intrin && replaceMulWithMul24(I)) | ||
| return true; | ||
| if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(), | ||
| TM.getTargetTransformInfo(F), DL)) | ||
| if (tryNarrowMathIfNoOverflow(&I)) | ||
| return true; | ||
|
|
||
| bool Changed = false; | ||
|
|
@@ -1436,7 +1431,7 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) { | |
|
|
||
| if (NewDiv) { | ||
| I.replaceAllUsesWith(NewDiv); | ||
| I.eraseFromParent(); | ||
| DeadVals.push_back(&I); | ||
| Changed = true; | ||
| } | ||
| } | ||
|
|
@@ -1492,7 +1487,7 @@ bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) { | |
| Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy); | ||
| Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType()); | ||
| I.replaceAllUsesWith(ValOrig); | ||
| I.eraseFromParent(); | ||
| DeadVals.push_back(&I); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -1534,7 +1529,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) { | |
|
|
||
| Fract->takeName(&I); | ||
| I.replaceAllUsesWith(Fract); | ||
| RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); | ||
| DeadVals.push_back(&I); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -1822,7 +1817,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) { | |
| } | ||
|
|
||
| I.replaceAllUsesWith(Vec); | ||
| I.eraseFromParent(); | ||
| DeadVals.push_back(&I); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -1903,7 +1898,7 @@ bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) { | |
| auto *Intrin = B.CreateIntrinsic( | ||
| I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)}); | ||
| I.replaceAllUsesWith(Intrin); | ||
| I.eraseFromParent(); | ||
| DeadVals.push_back(&I); | ||
| return true; | ||
| } | ||
|
|
||
|
|
@@ -2000,16 +1995,10 @@ bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) { | |
| Value *Fract = applyFractPat(Builder, FractArg); | ||
| Fract->takeName(&I); | ||
| I.replaceAllUsesWith(Fract); | ||
|
|
||
| RecursivelyDeleteTriviallyDeadInstructions(&I, TLI); | ||
| DeadVals.push_back(&I); | ||
| return true; | ||
| } | ||
|
|
||
| static bool isOneOrNegOne(const Value *Val) { | ||
| const APFloat *C; | ||
| return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0; | ||
| } | ||
|
|
||
| // Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way. | ||
| bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { | ||
| Type *Ty = Sqrt.getType()->getScalarType(); | ||
|
|
@@ -2030,18 +2019,6 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { | |
| if (ReqdAccuracy < 1.0f) | ||
| return false; | ||
|
|
||
| // FIXME: This is an ugly hack for this pass using forward iteration instead | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @arsenm Is this the right way to fix this hack? |
||
| // of reverse. If it worked like a normal combiner, the rsq would form before | ||
| // we saw a sqrt call. | ||
| auto *FDiv = | ||
| dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser()); | ||
| if (FDiv && FDiv->getOpcode() == Instruction::FDiv && | ||
| FDiv->getFPAccuracy() >= 1.0f && | ||
| canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) && | ||
| // TODO: We should also handle the arcp case for the fdiv with non-1 value | ||
| isOneOrNegOne(FDiv->getOperand(0))) | ||
| return false; | ||
|
|
||
| Value *SrcVal = Sqrt.getOperand(0); | ||
| bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt); | ||
|
|
||
|
|
@@ -2065,7 +2042,7 @@ bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) { | |
| Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals); | ||
| NewSqrt->takeName(&Sqrt); | ||
| Sqrt.replaceAllUsesWith(NewSqrt); | ||
| Sqrt.eraseFromParent(); | ||
| DeadVals.push_back(&Sqrt); | ||
| return true; | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you still need `make_early_inc_range`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I added a comment to explain why. Some external helpers still delete instructions for us.