|  | 
| 16 | 16 | #include "llvm/ADT/DenseMap.h" | 
| 17 | 17 | #include "llvm/ADT/STLExtras.h" | 
| 18 | 18 | #include "llvm/ADT/ScopeExit.h" | 
|  | 19 | +#include "llvm/ADT/SmallVector.h" | 
| 19 | 20 | #include "llvm/ADT/Statistic.h" | 
| 20 | 21 | #include "llvm/Analysis/AssumptionCache.h" | 
| 21 | 22 | #include "llvm/Analysis/BasicAliasAnalysis.h" | 
|  | 
| 29 | 30 | #include "llvm/IR/Dominators.h" | 
| 30 | 31 | #include "llvm/IR/Function.h" | 
| 31 | 32 | #include "llvm/IR/IRBuilder.h" | 
|  | 33 | +#include "llvm/IR/Instructions.h" | 
| 32 | 34 | #include "llvm/IR/PatternMatch.h" | 
| 33 | 35 | #include "llvm/Support/CommandLine.h" | 
| 34 | 36 | #include "llvm/Transforms/Utils/Local.h" | 
| 35 | 37 | #include "llvm/Transforms/Utils/LoopUtils.h" | 
| 36 | 38 | #include <numeric> | 
|  | 39 | +#include <optional> | 
| 37 | 40 | #include <queue> | 
| 38 | 41 | #include <set> | 
|  | 42 | +#include <tuple> | 
| 39 | 43 | 
 | 
| 40 | 44 | #define DEBUG_TYPE "vector-combine" | 
| 41 | 45 | #include "llvm/Transforms/Utils/InstructionWorklist.h" | 
| @@ -137,6 +141,7 @@ class VectorCombine { | 
| 137 | 141 |   bool foldSelectShuffle(Instruction &I, bool FromReduction = false); | 
| 138 | 142 |   bool foldInterleaveIntrinsics(Instruction &I); | 
| 139 | 143 |   bool shrinkType(Instruction &I); | 
|  | 144 | +  bool shrinkLoadForShuffles(Instruction &I); | 
| 140 | 145 | 
 | 
| 141 | 146 |   void replaceValue(Value &Old, Value &New) { | 
| 142 | 147 |     LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); | 
| @@ -3862,6 +3867,133 @@ bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) { | 
| 3862 | 3867 |   return true; | 
| 3863 | 3868 | } | 
| 3864 | 3869 | 
 | 
|  | 3870 | +// Attempt to shrink loads that are only used by shufflevector instructions. | 
|  | 3871 | +bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { | 
|  | 3872 | +  auto *OldLoad = dyn_cast<LoadInst>(&I); | 
|  | 3873 | +  if (!OldLoad || !OldLoad->isSimple()) | 
|  | 3874 | +    return false; | 
|  | 3875 | + | 
|  | 3876 | +  auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType()); | 
|  | 3877 | +  if (!OldLoadTy) | 
|  | 3878 | +    return false; | 
|  | 3879 | + | 
|  | 3880 | +  unsigned const OldNumElements = OldLoadTy->getNumElements(); | 
|  | 3881 | + | 
|  | 3882 | +  // Search all uses of load. If all uses are shufflevector instructions, and | 
|  | 3883 | +  // the second operands are all poison values, find the minimum and maximum | 
|  | 3884 | +  // indices of the vector elements referenced by all shuffle masks. | 
|  | 3885 | +  // Otherwise return `std::nullopt`. | 
|  | 3886 | +  using IndexRange = std::pair<int, int>; | 
|  | 3887 | +  auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> { | 
|  | 3888 | +    IndexRange OutputRange = IndexRange(OldNumElements, -1); | 
|  | 3889 | +    for (llvm::Use &Use : I.uses()) { | 
|  | 3890 | +      // Ensure all uses match the required pattern. | 
|  | 3891 | +      User *Shuffle = Use.getUser(); | 
|  | 3892 | +      ArrayRef<int> Mask; | 
|  | 3893 | + | 
|  | 3894 | +      if (!match(Shuffle, | 
|  | 3895 | +                 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask)))) | 
|  | 3896 | +        return std::nullopt; | 
|  | 3897 | + | 
|  | 3898 | +      // Ignore shufflevector instructions that have no uses. | 
|  | 3899 | +      if (Shuffle->use_empty()) | 
|  | 3900 | +        continue; | 
|  | 3901 | + | 
|  | 3902 | +      // Find the min and max indices used by the shufflevector instruction. | 
|  | 3903 | +      for (int Index : Mask) { | 
|  | 3904 | +        if (Index >= 0 && Index < static_cast<int>(OldNumElements)) { | 
|  | 3905 | +          OutputRange.first = std::min(Index, OutputRange.first); | 
|  | 3906 | +          OutputRange.second = std::max(Index, OutputRange.second); | 
|  | 3907 | +        } | 
|  | 3908 | +      } | 
|  | 3909 | +    } | 
|  | 3910 | + | 
|  | 3911 | +    if (OutputRange.second < OutputRange.first) | 
|  | 3912 | +      return std::nullopt; | 
|  | 3913 | + | 
|  | 3914 | +    return OutputRange; | 
|  | 3915 | +  }; | 
|  | 3916 | + | 
|  | 3917 | +  // Get the range of vector elements used by shufflevector instructions. | 
|  | 3918 | +  if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) { | 
|  | 3919 | +    unsigned const NewNumElements = Indices->second + 1u; | 
|  | 3920 | + | 
|  | 3921 | +    // If the range of vector elements is smaller than the full load, attempt | 
|  | 3922 | +    // to create a smaller load. | 
|  | 3923 | +    if (NewNumElements < OldNumElements) { | 
|  | 3924 | +      IRBuilder Builder(&I); | 
|  | 3925 | +      Builder.SetCurrentDebugLocation(I.getDebugLoc()); | 
|  | 3926 | + | 
|  | 3927 | +      // Calculate costs of old and new ops. | 
|  | 3928 | +      Type *ElemTy = OldLoadTy->getElementType(); | 
|  | 3929 | +      FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements); | 
|  | 3930 | +      Value *PtrOp = OldLoad->getPointerOperand(); | 
|  | 3931 | + | 
|  | 3932 | +      InstructionCost OldCost = TTI.getMemoryOpCost( | 
|  | 3933 | +          Instruction::Load, OldLoad->getType(), OldLoad->getAlign(), | 
|  | 3934 | +          OldLoad->getPointerAddressSpace(), CostKind); | 
|  | 3935 | +      InstructionCost NewCost = | 
|  | 3936 | +          TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(), | 
|  | 3937 | +                              OldLoad->getPointerAddressSpace(), CostKind); | 
|  | 3938 | + | 
|  | 3939 | +      using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>; | 
|  | 3940 | +      SmallVector<UseEntry, 4u> NewUses; | 
|  | 3941 | +      unsigned const MaxIndex = NewNumElements * 2u; | 
|  | 3942 | + | 
|  | 3943 | +      for (llvm::Use &Use : I.uses()) { | 
|  | 3944 | +        auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser()); | 
|  | 3945 | +        ArrayRef<int> OldMask = Shuffle->getShuffleMask(); | 
|  | 3946 | + | 
|  | 3947 | +        // Create entry for new use. | 
|  | 3948 | +        NewUses.push_back({Shuffle, OldMask}); | 
|  | 3949 | + | 
|  | 3950 | +        // Validate mask indices. | 
|  | 3951 | +        for (int Index : OldMask) { | 
|  | 3952 | +          if (Index >= static_cast<int>(MaxIndex)) | 
|  | 3953 | +            return false; | 
|  | 3954 | +        } | 
|  | 3955 | + | 
|  | 3956 | +        // Update costs. | 
|  | 3957 | +        OldCost += | 
|  | 3958 | +            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), | 
|  | 3959 | +                               OldLoadTy, OldMask, CostKind); | 
|  | 3960 | +        NewCost += | 
|  | 3961 | +            TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(), | 
|  | 3962 | +                               NewLoadTy, OldMask, CostKind); | 
|  | 3963 | +      } | 
|  | 3964 | + | 
|  | 3965 | +      LLVM_DEBUG( | 
|  | 3966 | +          dbgs() << "Found a load used only by shufflevector instructions: " | 
|  | 3967 | +                 << I << "\n  OldCost: " << OldCost | 
|  | 3968 | +                 << " vs NewCost: " << NewCost << "\n"); | 
|  | 3969 | + | 
|  | 3970 | +      if (OldCost < NewCost || !NewCost.isValid()) | 
|  | 3971 | +        return false; | 
|  | 3972 | + | 
|  | 3973 | +      // Create new load of smaller vector. | 
|  | 3974 | +      auto *NewLoad = cast<LoadInst>( | 
|  | 3975 | +          Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign())); | 
|  | 3976 | +      NewLoad->copyMetadata(I); | 
|  | 3977 | + | 
|  | 3978 | +      // Replace all uses. | 
|  | 3979 | +      for (UseEntry &Use : NewUses) { | 
|  | 3980 | +        ShuffleVectorInst *Shuffle = Use.first; | 
|  | 3981 | +        std::vector<int> &NewMask = Use.second; | 
|  | 3982 | + | 
|  | 3983 | +        Builder.SetInsertPoint(Shuffle); | 
|  | 3984 | +        Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc()); | 
|  | 3985 | +        Value *NewShuffle = Builder.CreateShuffleVector( | 
|  | 3986 | +            NewLoad, PoisonValue::get(NewLoadTy), NewMask); | 
|  | 3987 | + | 
|  | 3988 | +        replaceValue(*Shuffle, *NewShuffle); | 
|  | 3989 | +      } | 
|  | 3990 | + | 
|  | 3991 | +      return true; | 
|  | 3992 | +    } | 
|  | 3993 | +  } | 
|  | 3994 | +  return false; | 
|  | 3995 | +} | 
|  | 3996 | + | 
| 3865 | 3997 | /// This is the entry point for all transforms. Pass manager differences are | 
| 3866 | 3998 | /// handled in the callers of this function. | 
| 3867 | 3999 | bool VectorCombine::run() { | 
| @@ -3938,6 +4070,9 @@ bool VectorCombine::run() { | 
| 3938 | 4070 |         MadeChange |= foldSelectShuffle(I); | 
| 3939 | 4071 |         MadeChange |= foldShuffleToIdentity(I); | 
| 3940 | 4072 |         break; | 
|  | 4073 | +      case Instruction::Load: | 
|  | 4074 | +        MadeChange |= shrinkLoadForShuffles(I); | 
|  | 4075 | +        break; | 
| 3941 | 4076 |       case Instruction::BitCast: | 
| 3942 | 4077 |         MadeChange |= foldBitcastShuffle(I); | 
| 3943 | 4078 |         break; | 
|  | 
0 commit comments