llvm · perlfu · Sep 1, 2025 · Sep 25, 2025 · Sep 29, 2025 · Sep 29, 2025
diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h
@@ -54,7 +54,7 @@ template <typename _FunctionT> class GenericSSAContext {
 
   // The null value for ValueRefT. For LLVM IR and MIR, this is simply the
   // default constructed value.
-  static constexpr ValueRefT *ValueRefNull = {};
+  static constexpr ValueRefT ValueRefNull = {};
 
   // An InstructionT usually defines one or more ValueT objects.
   using InstructionT = typename SSATraits::InstructionT;
@@ -96,6 +96,10 @@ template <typename _FunctionT> class GenericSSAContext {
   static bool isConstantOrUndefValuePhi(const InstructionT &Instr);
   const BlockT *getDefBlock(ConstValueRefT value) const;
 
+  void getPhiInputs(const InstructionT &Instr,
+                    SmallVectorImpl<ConstValueRefT> &Values,
+                    SmallVectorImpl<const BlockT *> &Blocks) const;
+
   Printable print(const BlockT *block) const;
   Printable printAsOperand(const BlockT *BB) const;
   Printable print(const InstructionT *inst) const;

diff --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -455,9 +455,11 @@ template <typename ContextT> class GenericUniformityAnalysisImpl {
   /// the worklist.
   void taintAndPushAllDefs(const BlockT &JoinBlock);
 
-  /// \brief Mark all phi nodes in \p JoinBlock as divergent and push them on
-  /// the worklist.
-  void taintAndPushPhiNodes(const BlockT &JoinBlock);
+  /// \brief Mark phi nodes in \p JoinBlock as divergent and push them on
+  /// the worklist if they are divergent over the by the path \p JoinBlock
+  /// to \p DivTermBlock.
+  void taintAndPushPhiNodes(const BlockT &JoinBlock, const BlockT &DivTermBlock,
+                            const DivergenceDescriptorT &DivDesc);
 
   /// \brief Identify all Instructions that become divergent because \p DivExit
   /// is a divergent cycle exit of \p DivCycle. Mark those instructions as
@@ -917,7 +919,8 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
 /// Mark divergent phi nodes in a join block
 template <typename ContextT>
 void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
-    const BlockT &JoinBlock) {
+    const BlockT &JoinBlock, const BlockT &DivTermBlock,
+    const DivergenceDescriptorT &DivDesc) {
   LLVM_DEBUG(dbgs() << "taintAndPushPhiNodes in " << Context.print(&JoinBlock)
                     << "\n");
   for (const auto &Phi : JoinBlock.phis()) {
@@ -930,6 +933,44 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushPhiNodes(
     // https://reviews.llvm.org/D19013
     if (ContextT::isConstantOrUndefValuePhi(Phi))
       continue;
+
+    // Attempt to maintain uniformity for PHIs by considering control
+    // dependencies.
+    SmallVector<ConstValueRefT> Values;
+    SmallVector<const BlockT *> Blocks;
+    Context.getPhiInputs(Phi, Values, Blocks);
+    assert(Blocks.size() == Values.size());
+
+    // Allow an empty Blocks/Values list to signify getPhiInputs is not
+    // implemented; in which case no uniformity is possible.
+    bool Uniform = !Values.empty();
+
+    std::optional<ConstValueRefT> CommonValue;
+    for (unsigned I = 0; I < Blocks.size() && Uniform; ++I) {
+      if (DivDesc.CycleDivBlocks.contains(Blocks[I])) {
+        // If PHI is reachable via divergent exit it is divergent.
+        Uniform = false;
+      } else if (DT.dominates(&DivTermBlock, Blocks[I]) ||
+                 DivDesc.BlockLabels.lookup_or(Blocks[I], nullptr)) {
+        // If all edges from the marked path share a common value then
+        // uniformity is preserved when the value is itself uniform.
+        if (!CommonValue)
+          CommonValue = Values[I];
+        else
+          Uniform = Values[I] == *CommonValue;
+      }
+      // Ignore undefined values when checking definitions.
+      if (!Values[I])
+        continue;
+      // Any value defined on the divergent path is divergent.
+      const BlockT *DefBlock = Context.getDefBlock(Values[I]);
+      if (DivDesc.BlockLabels.lookup_or(DefBlock, nullptr))
+        Uniform = false;
+    }
+    if (Uniform)
+      continue;
+
+    LLVM_DEBUG(dbgs() << "tainted: " << Phi << "\n");
     markDivergent(Phi);
   }
 }
@@ -1087,7 +1128,7 @@ void GenericUniformityAnalysisImpl<ContextT>::analyzeControlDivergence(
       DivCycles.push_back(Outermost);
       continue;
     }
-    taintAndPushPhiNodes(*JoinBlock);
+    taintAndPushPhiNodes(*JoinBlock, *DivTermBlock, DivDesc);
   }
 
   // Sort by order of decreasing depth. This allows later cycles to be skipped

diff --git a/llvm/lib/CodeGen/MachineSSAContext.cpp b/llvm/lib/CodeGen/MachineSSAContext.cpp
@@ -84,6 +84,20 @@ bool MachineSSAContext::isConstantOrUndefValuePhi(const MachineInstr &Phi) {
   return true;
 }
 
+template <>
+void MachineSSAContext::getPhiInputs(
+    const MachineInstr &Phi, SmallVectorImpl<Register> &Values,
+    SmallVectorImpl<const MachineBasicBlock *> &Blocks) const {
+  if (!Phi.isPHI())
+    return;
+  for (unsigned Idx = 1, End = Phi.getNumOperands(); Idx < End; Idx += 2) {
+    // FIXME: ideally we would turn undef values into ValueRefNull.
+    // This could reduce number of PHIs marked in taintAndPushPhiNodes().
+    Values.push_back(Phi.getOperand(Idx).getReg());
+    Blocks.push_back(Phi.getOperand(Idx + 1).getMBB());
+  }
+}
+
 template <>
 Intrinsic::ID MachineSSAContext::getIntrinsicID(const MachineInstr &MI) {
   if (auto *GI = dyn_cast<GIntrinsic>(&MI))

diff --git a/llvm/lib/IR/SSAContext.cpp b/llvm/lib/IR/SSAContext.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/IR/SSAContext.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
@@ -68,6 +69,20 @@ bool SSAContext::isConstantOrUndefValuePhi(const Instruction &Instr) {
   return false;
 }
 
+template <>
+void SSAContext::getPhiInputs(
+    const Instruction &Instr, SmallVectorImpl<const Value *> &Values,
+    SmallVectorImpl<const BasicBlock *> &Blocks) const {
+  if (auto *Phi = dyn_cast<PHINode>(&Instr)) {
+    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
+      const Value *Incoming = Phi->getIncomingValue(I);
+      const BasicBlock *Block = Phi->getIncomingBlock(I);
+      Values.push_back(!isa<UndefValue>(Incoming) ? Incoming : ValueRefNull);
+      Blocks.push_back(Block);
+    }
+  }
+}
+
 template <> Intrinsic::ID SSAContext::getIntrinsicID(const Instruction &I) {
   if (auto *CB = dyn_cast<CallBase>(&I))
     return CB->getIntrinsicID();

diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/phi_no_divergence.ll
@@ -0,0 +1,114 @@
+; RUN: opt %s -mtriple amdgcn-- -passes='print<uniformity>' -disable-output 2>&1 | FileCheck %s
+
+define amdgpu_kernel void @no_divergent_exit1(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit1'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %body.1 ], [ %next.exit.val, %body.2 ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %header
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %header
+
+end:
+  ret void
+}
+
+define amdgpu_kernel void @no_divergent_exit2(i32 %a, i32 %b, i32 %c) #0 {
+; CHECK-LABEL: for function 'no_divergent_exit2'
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br label %header
+
+header:
+  %loop.b = phi i32 [ %b, %entry ], [ %merge.b, %merge ]
+; CHECK-NOT: DIVERGENT: %loop.b =
+  %loop.c = phi i32 [ %c, %entry ], [ %merge.c, %merge ]
+; CHECK: DIVERGENT: %loop.c =
+  %exit.val = phi i32 [ %a, %entry ], [ %next.exit.val, %merge ]
+; CHECK-NOT: DIVERGENT: %exit.val =
+  %exit.cond = icmp slt i32 %exit.val, 42
+; CHECK-NOT: DIVERGENT: %exit.cond =
+  br i1 %exit.cond, label %end, label %body.1
+; CHECK-NOT: DIVERGENT: br i1 %exit.cond,
+
+body.1:
+  %new.b = add i32 %loop.b, 1
+; CHECK-NOT: DIVERGENT: %new.b =
+  %next.exit.val = add i32 %exit.val, 1
+; CHECK-NOT: DIVERGENT: %next.exit.val =
+  br i1 %div.cond, label %body.2, label %merge
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+body.2:
+  %new.c = add i32 %loop.c, 1
+; CHECK: DIVERGENT: %new.c =
+  br label %merge
+
+merge:
+  %merge.b = phi i32 [ %new.b, %body.1 ], [ %new.b, %body.2 ]
+; CHECK-NOT: DIVERGENT: %merge.b =
+  %merge.c = phi i32 [ %loop.c, %body.1 ], [ %new.c, %body.2 ]
+; CHECK: DIVERGENT: %merge.c =
+  br label %header
+
+end:
+  ret void
+}
+
+define amdgpu_kernel void @no_loop_phi_divergence(i32 %a) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %uni.cond = icmp slt i32 %a, 0
+; CHECK-NOT: DIVERGENT: %uni.cond =
+  %div.cond = icmp slt i32 %tid, 0
+; CHECK: DIVERGENT: %div.cond =
+  br i1 %uni.cond, label %div.branch.block, label %merge
+; CHECK-NOT: DIVERGENT: br i1 %uni.cond,
+
+div.branch.block:
+  br i1 %div.cond, label %div.block.1, label %div.block.2
+; CHECK: DIVERGENT: br i1 %div.cond,
+
+div.block.1:
+  br label %merge
+
+div.block.2:
+  br label %merge
+
+merge:
+  %uni.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 1, %div.block.2 ]
+; CHECK-NOT: DIVERGENT: %uni.val =
+  %div.val = phi i32 [ 0, %entry ], [ 1, %div.block.1 ], [ 2, %div.block.2 ]
+; CHECK: DIVERGENT: %div.val =
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }