Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 8 additions & 6 deletions llvm/include/llvm/Transforms/Utils/LoopRotationUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#ifndef LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H
#define LLVM_TRANSFORMS_UTILS_LOOPROTATIONUTILS_H

#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"

namespace llvm {
Expand All @@ -32,12 +33,13 @@ class TargetTransformInfo;
/// header. If the loop header's size exceeds the threshold, the loop rotation
/// will give up. The flag IsUtilMode controls the heuristic used in the
/// LoopRotation. If it is true, the profitability heuristic will be ignored.
LLVM_ABI bool LoopRotation(Loop *L, LoopInfo *LI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
DominatorTree *DT, ScalarEvolution *SE,
MemorySSAUpdater *MSSAU, const SimplifyQuery &SQ,
bool RotationOnly, unsigned Threshold,
bool IsUtilMode, bool PrepareForLTO = false);
LLVM_ABI bool LoopRotation(
Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly, unsigned Threshold,
bool IsUtilMode, bool PrepareForLTO = false,
function_ref<bool(Loop *, ScalarEvolution *)> profitabilityCheck =
[](Loop *, ScalarEvolution *) { return false; });

} // namespace llvm

Expand Down
28 changes: 16 additions & 12 deletions llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,16 +69,19 @@ class LoopRotate {
bool RotationOnly;
bool IsUtilMode;
bool PrepareForLTO;
function_ref<bool(Loop *, ScalarEvolution *)> profitabilityCheck;

public:
LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
const TargetTransformInfo *TTI, AssumptionCache *AC,
DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
bool PrepareForLTO)
bool PrepareForLTO,
function_ref<bool(Loop *, ScalarEvolution *)> profitabilityCheck)
: MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO),
profitabilityCheck(profitabilityCheck) {}
bool processLoop(Loop *L);

private:
Expand Down Expand Up @@ -440,9 +443,9 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {

// Rotate if either the loop latch does *not* exit the loop, or if the loop
// latch was just simplified. Or if we think it will be profitable.
if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
!profitableToRotateLoopExitingLatch(L) &&
!canRotateDeoptimizingLatchExit(L))
if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch &&
IsUtilMode == false && !profitableToRotateLoopExitingLatch(L) &&
!canRotateDeoptimizingLatchExit(L) && !profitabilityCheck(L, SE))
return Rotated;

// Check size of original header and reject loop if it is very big or we can't
Expand Down Expand Up @@ -1053,13 +1056,14 @@ bool LoopRotate::processLoop(Loop *L) {


/// The utility to convert a loop into a loop with bottom test.
bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
AssumptionCache *AC, DominatorTree *DT,
ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly = true,
unsigned Threshold = unsigned(-1),
bool IsUtilMode = true, bool PrepareForLTO) {
bool llvm::LoopRotation(
Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly = true,
unsigned Threshold = unsigned(-1), bool IsUtilMode = true,
bool PrepareForLTO,
function_ref<bool(Loop *, ScalarEvolution *)> profitabilityCheck) {
LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
IsUtilMode, PrepareForLTO);
IsUtilMode, PrepareForLTO, profitabilityCheck);
return LR.processLoop(L);
}
24 changes: 22 additions & 2 deletions llvm/lib/Transforms/Utils/LoopUnroll.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopRotationUtils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
Expand Down Expand Up @@ -484,8 +485,27 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,

assert(ULO.Count > 0);

// All these values should be taken only after peeling because they might have
// changed.
if (ULO.Runtime && SE) {
BasicBlock *OrigHeader = L->getHeader();
Copy link

Copilot AI Jul 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] Before attempting rotation, check that the loop has a preheader and a latch block to avoid potential null-pointer assertions inside LoopRotation.

Suggested change
BasicBlock *OrigHeader = L->getHeader();
BasicBlock *OrigHeader = L->getHeader();
if (!L->getLoopPreheader()) {
LLVM_DEBUG(dbgs() << " Can't rotate loop; missing preheader.\n");
return LoopUnrollResult::Unmodified;
}

Copilot uses AI. Check for mistakes.

BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
// Rotate loop if it makes it countable (for later unrolling)
if (BI && !BI->isUnconditional() &&
isa<SCEVCouldNotCompute>(SE->getExitCount(L, L->getLoopLatch())) &&
!isa<SCEVCouldNotCompute>(SE->getExitCount(L, OrigHeader))) {
LLVM_DEBUG(dbgs() << " Rotating loop to make the loop countable.\n");
SimplifyQuery SQ{OrigHeader->getDataLayout()};
SQ.TLI = nullptr;
SQ.DT = DT;
SQ.AC = AC;
llvm::LoopRotation(L, LI, TTI, AC, DT, SE, nullptr /*MemorySSAUpdater*/,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to first perform the profitability checks on the unrotated form, and then only rotate if runtime unrolling is profitable?

Otherwise we may rotate without actually unrolling the loop. Also, if we do it here, we need to indicate that the IR has been changed even if nothing gets unrolled.

Copy link
Contributor Author

@mark-sed mark-sed Jul 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When adding the rotation I tried to push it as far back as possible so as not to rotate when it is not necessary, but this of course might still happen. I looked into whether it would be possible to run some of the legality/profitability checks before the rotation, but I can't really see any good way to do this, as they rely on the rotated loop structure.
The only possible one is to check whether the loop is in simplified form.

I could also try to move this rotation into the UnrollRuntimeLoopRemainder and then recompute all the resources used in unroll pass, which are currently collected after the rotation, but I don't think that would change much.

You are correct with the indication. I'll add a new result for when the rotation happens.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could also try to move this rotation into the UnrollRuntimeLoopRemainder and then recompute all the resources used in unroll pass, which are currently collected after the rotation, but I don't think that would change much.

Just to clarify, even if this was done, there is no guarantee that unrolling will definitely happen if we were to rotate. There are further checks on the loop within UnrollRuntimeLoopRemainder which would need to be rerun on the rotated loop.

The check where we bail out in runtime unrolling's UnrollRuntimeLoopRemainder without rotation is here:

   // Add 1 since the backedge count doesn't include the first loop iteration.
    // (Note that overflow can occur, this is handled explicitly below)
    const SCEV *TripCountSC =
        SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1)); 
    if (isa<SCEVCouldNotCompute>(TripCountSC)) {
      LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
      return false;
    }

We then use TripCountSC to do further legality checks on runtime unrolling.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, but that's just because we check the TC of the latch block, right? So we should be able to get the right trip count we would have after rotating, before actually doing the rotation

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we could get the right trip count and that was the original place where we decided to do the rotation ( i.e. if that check failed, only then do the rotation). The remaining checks after that are sort of trivial and can be computed as-if the loop was rotated as well.

It had worked fine - the loop was only rotated once all the other checks passed and we also unrolled the loop. However, the caller of runtime unrolling (llvm::UnrollLoop) expected the loop structure to not be changed in this manner (it precomputed the LoopHeader, latch etc and some additional properties on top of it).

One thing we had thought of is: rotate the loop within runtime unrolling, then set a flag that the loop was rotated.
In llvm::UnrollLoop in loopUnroll.cpp, we would need to recompute the structures which were previously computed since now rotation was done.

In short, the callers of UnrollRuntimeLoopRemainder do not expect rotation to have happened. Another such caller is LoopUnrollAndJam.

@marek-sed, can you pls try the previous MR we had and see if we could recompute the structures.

Copy link
Contributor Author

@mark-sed mark-sed Jul 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have tested moving the rotation into the UnrollRuntimeLoopRemainder and the recalculating is way simpler than I first thought simply because a bunch of the calculations can be moved after the call to UnrollRuntimeLoopRemainder (e.g. the ExitInfo) and so pretty much only LatchBlock has to be recomputed.
I'll create an MR with this approach.

SQ, false /*RotationOnly*/, 16 /*Threshold*/,
false /*IsUtilMode*/, false /*PrepareForLTO*/,
[](Loop *, ScalarEvolution *) { return true; });
}
}

// All these values should be taken only after peeling or loop rotation
// because they might have changed.
BasicBlock *Preheader = L->getLoopPreheader();
BasicBlock *Header = L->getHeader();
BasicBlock *LatchBlock = L->getLoopLatch();
Expand Down
99 changes: 99 additions & 0 deletions llvm/test/Transforms/LoopUnroll/X86/runtime-unroll-after-rotate.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt --passes=loop-unroll -unroll-runtime-other-exit-predictable=1 -S %s | FileCheck %s
target triple = "x86_64-unknown-linux-gnu"

; Test input: a loop with two exiting blocks. The header exits to %after when
; the induction value reaches zero, and the block that branches back to the
; header (%body) exits to %end based on a loaded value.
; The CHECK lines below are autogenerated (see the NOTE at the top of the
; file) and should be regenerated rather than edited by hand.
define void @test(i64 %0) #0 {
; CHECK-LABEL: define void @test(
; CHECK-SAME: i64 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[B1:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: br i1 [[B1]], label %[[AFTER:.*]], label %[[BODY_LR_PH:.*]]
; CHECK: [[BODY_LR_PH]]:
; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = freeze i64 [[TMP1]]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -1
; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; CHECK-NEXT: br i1 [[LCMP_MOD]], label %[[BODY_PROL_PREHEADER:.*]], label %[[BODY_PROL_LOOPEXIT:.*]]
; CHECK: [[BODY_PROL_PREHEADER]]:
; CHECK-NEXT: br label %[[BODY_PROL:.*]]
; CHECK: [[BODY_PROL]]:
; CHECK-NEXT: [[A2_PROL:%.*]] = phi i64 [ [[TMP0]], %[[BODY_PROL_PREHEADER]] ], [ [[A_PROL:%.*]], %[[HEADER_PROL:.*]] ]
; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ 0, %[[BODY_PROL_PREHEADER]] ], [ [[PROL_ITER_NEXT:%.*]], %[[HEADER_PROL]] ]
; CHECK-NEXT: [[C_PROL:%.*]] = add i64 [[A2_PROL]], 1
; CHECK-NEXT: [[D_PROL:%.*]] = load i32, ptr addrspace(1) null, align 4
; CHECK-NEXT: [[E_PROL:%.*]] = icmp eq i32 [[D_PROL]], 0
; CHECK-NEXT: br i1 [[E_PROL]], label %[[END_LOOPEXIT3:.*]], label %[[HEADER_PROL]]
; CHECK: [[HEADER_PROL]]:
; CHECK-NEXT: [[A_PROL]] = phi i64 [ [[C_PROL]], %[[BODY_PROL]] ]
; CHECK-NEXT: [[B_PROL:%.*]] = icmp eq i64 [[A_PROL]], 0
; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label %[[BODY_PROL]], label %[[BODY_PROL_LOOPEXIT_UNR_LCSSA:.*]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[BODY_PROL_LOOPEXIT_UNR_LCSSA]]:
; CHECK-NEXT: [[A2_UNR_PH:%.*]] = phi i64 [ [[A_PROL]], %[[HEADER_PROL]] ]
; CHECK-NEXT: br label %[[BODY_PROL_LOOPEXIT]]
; CHECK: [[BODY_PROL_LOOPEXIT]]:
; CHECK-NEXT: [[A2_UNR:%.*]] = phi i64 [ [[TMP0]], %[[BODY_LR_PH]] ], [ [[A2_UNR_PH]], %[[BODY_PROL_LOOPEXIT_UNR_LCSSA]] ]
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 3
; CHECK-NEXT: br i1 [[TMP4]], label %[[HEADER_AFTER_CRIT_EDGE:.*]], label %[[BODY_LR_PH_NEW:.*]]
; CHECK: [[BODY_LR_PH_NEW]]:
; CHECK-NEXT: br label %[[BODY:.*]]
; CHECK: [[HEADER:.*]]:
; CHECK-NEXT: br i1 false, label %[[END_LOOPEXIT:.*]], label %[[HEADER_1:.*]]
; CHECK: [[HEADER_1]]:
; CHECK-NEXT: br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_2:.*]]
; CHECK: [[HEADER_2]]:
; CHECK-NEXT: [[C_7:%.*]] = add i64 [[A2:%.*]], 4
; CHECK-NEXT: br i1 false, label %[[END_LOOPEXIT]], label %[[HEADER_3:.*]]
; CHECK: [[HEADER_3]]:
; CHECK-NEXT: [[B_7:%.*]] = icmp eq i64 [[C_7]], 0
; CHECK-NEXT: br i1 [[B_7]], label %[[HEADER_AFTER_CRIT_EDGE_UNR_LCSSA:.*]], label %[[BODY]]
; CHECK: [[BODY]]:
; CHECK-NEXT: [[A2]] = phi i64 [ [[A2_UNR]], %[[BODY_LR_PH_NEW]] ], [ [[C_7]], %[[HEADER_3]] ]
; CHECK-NEXT: [[D:%.*]] = load i32, ptr addrspace(1) null, align 4
; CHECK-NEXT: [[E:%.*]] = icmp eq i32 [[D]], 0
; CHECK-NEXT: br i1 [[E]], label %[[END_LOOPEXIT]], label %[[HEADER]]
; CHECK: [[END_LOOPEXIT]]:
; CHECK-NEXT: br label %[[END:.*]]
; CHECK: [[END_LOOPEXIT3]]:
; CHECK-NEXT: br label %[[END]]
; CHECK: [[END]]:
; CHECK-NEXT: ret void
; CHECK: [[HEADER_AFTER_CRIT_EDGE_UNR_LCSSA]]:
; CHECK-NEXT: br label %[[HEADER_AFTER_CRIT_EDGE]]
; CHECK: [[HEADER_AFTER_CRIT_EDGE]]:
; CHECK-NEXT: br label %[[AFTER]]
; CHECK: [[AFTER]]:
; CHECK-NEXT: call void @foo(i32 0)
; CHECK-NEXT: ret void
;
entry:
br label %header

; Loop header: %a starts at the argument %0 and takes %c on the back edge.
; Leaves the loop to %after once %a is zero.
header:
%a = phi i64 [ %0, %entry ], [ %c, %body ]
%b = icmp eq i64 %a, 0
br i1 %b, label %after, label %body

; Branches back to %header, so this block is the source of the back edge.
; Its exit to %end is controlled by a value loaded from memory rather than
; by %a.
body:
%c = add i64 %a, 1
%d = load i32, ptr addrspace(1) null, align 4
%e = icmp eq i32 %d, 0
br i1 %e, label %end, label %header

; Exit target reached from %body.
end:
ret void

; Exit target reached from %header; calls @foo before returning.
after:
call void @foo(i32 0)
ret void
}

declare void @foo(i32)

attributes #0 = { "tune-cpu"="generic" }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should not be needed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When not set, on my machine, the cpu chosen is "i586" and from what I can see in UnrollingPreferences it has different values from the "generic" one and so the runtime unrolling code is not even run. I suppose it is because of the disabled Runtime flag.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Try adding -unroll-runtime=true -unroll-runtime-multi-exit=true.

Also, it would be nice to precommit this change in a separate MR with all the options (including the other-exit-predictable), so that we can see that the patch with rotation helps to runtime unroll the loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@annamthomas Thank you, adding the -unroll-runtime flag helped.

;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.unroll.disable"}
;.
Loading
Loading