diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h
index 6aa3a8b9b6e0b..abcc093dd88ab 100644
--- a/llvm/include/llvm/ADT/GenericSSAContext.h
+++ b/llvm/include/llvm/ADT/GenericSSAContext.h
@@ -54,7 +54,7 @@ template <typename SSATraits> class GenericSSAContext {
 
   // The null value for ValueRefT. For LLVM IR and MIR, this is simply the
   // default constructed value.
-  static constexpr ValueRefT *ValueRefNull = {};
+  static constexpr ValueRefT ValueRefNull = {};
 
   // An InstructionT usually defines one or more ValueT objects.
   using InstructionT = typename SSATraits::InstructionT;
@@ -96,6 +96,10 @@ template <typename SSATraits> class GenericSSAContext {
   static bool isConstantOrUndefValuePhi(const InstructionT &Instr);
   const BlockT *getDefBlock(ConstValueRefT value) const;
 
+  void getPhiInputs(const InstructionT &Instr,
+                    SmallVectorImpl<ConstValueRefT> &Values,
+                    SmallVectorImpl<const BlockT *> &Blocks) const;
+
   Printable print(const BlockT *block) const;
   Printable printAsOperand(const BlockT *BB) const;
   Printable print(const InstructionT *inst) const;
diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 3b9b7f2633771..888dc074e054c 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -455,9 +455,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// the worklist.
   void taintAndPushAllDefs(const BlockT &JoinBlock);
 
-  /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
-  /// the worklist.
-  void taintAndPushPhiNodes(const BlockT &JoinBlock);
+  /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
+  /// the worklist if they are divergent over the path from \p JoinBlock
+  /// to \p DivTermBlock.
+  void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
+                            const DivergenceDescriptorT &DivDesc);
 
   /// \brief Identify all Instructions that become divergent because \p DivExit
   /// is a divergent cycle exit of \p DivCycle. Mark those instructions as
@@ -917,19 +919,70 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
 /// Mark divergent phi nodes in a join block
 template <typename ContextT>
 void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
-    const BlockT &JoinBlock) {
+    const BlockT &JoinBlock, const BlockT &DivTermBlock,
+    const DivergenceDescriptorT &DivDesc) {
   LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
                     << "\n");
   for (const auto &Phi : JoinBlock.phis()) {
-    // FIXME: The non-undef value is not constant per se; it just happens to be
-    // uniform and may not dominate this PHI. So assuming that the same value
-    // reaches along all incoming edges may itself be undefined behaviour. This
-    // particular interpretation of the undef value was added to
-    // DivergenceAnalysis in the following review:
-    //
-    // https://reviews.llvm.org/D19013
-    if (ContextT::isConstantOrUndefValuePhi(Phi))
+    // Attempt to maintain uniformity for PHIs by considering control
+    // dependencies before marking them.
+    SmallVector<ConstValueRefT> Values;
+    SmallVector<const BlockT *> Blocks;
+    Context.getPhiInputs(Phi, Values, Blocks);
+    assert(Blocks.size() == Values.size());
+
+    // Allow an empty Blocks/Values list to signify getPhiInputs is not
+    // implemented; in which case no uniformity is possible.
+    bool HasSingleValue = !Values.empty();
+    bool UniformOnPath = HasSingleValue;
+
+    std::optional<ConstValueRefT> PhiCommon, PathCommon;
+    for (unsigned I = 0; I < Blocks.size() && (UniformOnPath || HasSingleValue);
+         ++I) {
+      // FIXME: We assume undefs are uniform and/or do not dominate the PHI
+      // in the presence of other constant or uniform values.
+      // This particular interpretation of the undef value was added to
+      // DivergenceAnalysis in the following review:
+      //
+      // https://reviews.llvm.org/D19013
+      if (!Values[I])
+        continue;
+
+      // Track common value for all inputs.
+      if (!PhiCommon)
+        PhiCommon = Values[I];
+      else if (Values[I] != *PhiCommon)
+        HasSingleValue = false;
+
+      // Divergent path does not have uniform value.
+      if (!UniformOnPath)
+        continue;
+
+      // Only consider predecessors on divergent path.
+      if (Blocks[I] != &DivTermBlock &&
+          !DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr))
+        continue;
+
+      // Phi is reached via divergent exit (i.e. respect temporal divergence).
+      if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+        UniformOnPath = false;
+        continue;
+      }
+
+      // Phi uniformity is maintained if all values on divergent path match.
+      if (!PathCommon)
+        PathCommon = Values[I];
+      else if (Values[I] != *PathCommon) {
+        UniformOnPath = false;
+        assert(!HasSingleValue);
+        break;
+      }
+    }
+
+    if (UniformOnPath || HasSingleValue)
       continue;
+
+    LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
     markDivergent(Phi);
   }
 }
@@ -1087,7 +1140,7 @@ void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
       DivCycles.push_back(Outermost);
       continue;
     }
-    taintAndPushPhiNodes(*JoinBlock);
+    taintAndPushPhiNodes(*JoinBlock, *DivTermBlock, DivDesc);
   }
 
   // Sort by order of decreasing depth. This allows later cycles to be skipped
diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
index bbbfb3ce2788d..2a378772d93bb 100644
--- a/llvm/lib/CodeGen/MachineSSAContext.cpp
+++ b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -84,6 +84,25 @@ bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
   return true;
 }
 
+template <>
+void MachineSSAContext::getPhiInputs(
+    const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
+    SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
+  assert(Phi.isPHI());
+
+  const MachineRegisterInfo &MRI = Phi.getMF()->getRegInfo();
+  // const Register DstReg = Phi.getOperand(0).getReg();
+  for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
+    Register Incoming = Phi.getOperand(Idx).getReg();
+    MachineInstr *Def = MRI.getVRegDef(Incoming);
+    // FIXME: should this also consider Incoming == DstReg undef?
+    if (Def && isUndef(*Def))
+      Incoming = ValueRefNull;
+    Values.push_back(Incoming);
+    Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
+  }
+}
+
 template <>
 Intrinsic::ID MachineSSAContext::getIntrinsicID(const MachineInstr &MI) {
   if (auto *GI = dyn_cast<GIntrinsic>(&MI))
diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
index 20b6ea1e972d4..bc35d904ba61f 100644
--- a/llvm/lib/IR/SSAContext.cpp
+++ b/llvm/lib/IR/SSAContext.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/IR/SSAContext.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -68,6 +69,23 @@ bool SSAContext::isConstantOrUndefValuePhi(const Instruction &Instr) {
   return false;
 }
 
+template <>
+void SSAContext::getPhiInputs(
+    const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
+    SmallVectorImpl<const BasicBlock *> &Blocks) const {
+  assert(isa<PHINode>(Instr));
+  const PHINode *Phi = static_cast<const PHINode *>(&Instr);
+  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+    const Value *Incoming = Phi->getIncomingValue(I);
+    const BasicBlock *Block = Phi->getIncomingBlock(I);
+    // FIXME: should this also consider Incoming == &Instr undef?
+    if (isa<UndefValue>(Incoming))
+      Incoming = ValueRefNull;
+    Values.push_back(Incoming);
+    Blocks.push_back(Block);
+  }
+}
+
 template <>
 Intrinsic::ID SSAContext::getIntrinsicID(const Instruction &I) {
   if (auto *CB = dyn_cast<CallBase>(&I)) return CB->getIntrinsicID();
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
new file mode 100644
index 0000000000000..be7a5e356e67b
--- /dev/null
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -0,0 +1,121 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+; Test PHIs that are uniform because they have a common/constant value over
+; the divergent paths.
+
+; Loop is uniform because loop exit PHI has constant value over all internal
+; divergent paths.
+define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit1'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %body.1 ], [ %next.exit.val, %body.2 ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %header
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %header
+
+end:
+  ret void
+}
+
+; As no_divergent_exit1 but with merge block before exit.
+define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit2'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %merge.b, %merge ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %merge.c, %merge ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %merge ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %merge
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %merge
+
+merge:
+  %merge.b = phi i32 [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %merge.b =
+  %merge.c = phi i32 [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %merge.c =
+  br label %header
+
+end:
+  ret void
+}
+
+; Test PHI with constant value over divergent path without a loop.
+define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+; CHECK-NOT: DIVERGENT: %uni.cond =
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br i1 %uni.cond, label %div.branch.block, label %merge
+; CHECK-NOT: DIVERGENT: br i1 %uni.cond,
+
+div.branch.block:
+  br i1 %div.cond, label %div.block.1, label %div.block.2
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+div.block.1:
+  br label %merge
+
+div.block.2:
+  br label %merge
+
+merge:
+  %uni.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 1, %div.block.2 ]
+; CHECK-NOT: DIVERGENT: %uni.val =
+  %div.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 2, %div.block.2 ]
+; CHECK: DIVERGENT: %div.val =
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }