//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass. Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles. It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis. Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// run-time disambiguate potentially overlapping arrays.
//
//===----------------------------------------------------------------------===//
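//
// Illustrative example (editor's sketch, not from the original file; array
// names and bounds are made up): given a loop such as
//
//   for (int i = 0; i < n; i++) {
//     a[i + 1] = a[i] + b[i]; // dependence cycle prevents vectorization
//     c[i] = d[i] * 2;        // independent of the cycle
//   }
//
// the pass distributes it into a sequential loop isolating the cycle and a
// second, now vectorizable, loop:
//
//   for (int i = 0; i < n; i++)
//     a[i + 1] = a[i] + b[i];
//   for (int i = 0; i < n; i++)
//     c[i] = d[i] * 2;
//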
#include "llvm/Transforms/Scalar/LoopDistribute.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <functional>
#include <list>
#include <tuple>
#include <utility>

using namespace llvm;

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

/// @{
/// Metadata attribute names
static const char *const LLVMLoopDistributeFollowupAll =
    "llvm.loop.distribute.followup_all";
static const char *const LLVMLoopDistributeFollowupCoincident =
    "llvm.loop.distribute.followup_coincident";
static const char *const LLVMLoopDistributeFollowupSequential =
    "llvm.loop.distribute.followup_sequential";
static const char *const LLVMLoopDistributeFollowupFallback =
    "llvm.loop.distribute.followup_fallback";
/// @}
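//
// Illustrative sketch (editor's addition; the metadata node names below are
// hypothetical, only the string attributes above are defined by this pass):
// followup attributes let a pipeline attach properties to the loops produced
// by distribution, e.g. to queue the cycle-free loops for vectorization:
//
//   !loop = distinct !{!loop, !dist, !followup}
//   !dist = !{!"llvm.loop.distribute.enable", i1 true}
//   !followup = !{!"llvm.loop.distribute.followup_coincident", !vec}
//   !vec = !{!"llvm.loop.vectorize.enable", i1 true}
//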
static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));

static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"),
    cl::init(false));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {

/// Maintains the set of instructions of the loop for a partition before
/// cloning. After cloning, it hosts the new loop.
class InstPartition {
  using InstructionSet = SmallPtrSet<Instruction *, 8>;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L) {
    Set.insert(I);
  }

  /// Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// Moves this partition into \p Other. This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }
  /// Populates the partition with a transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }
  /// Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop then use the block from
  /// the loop.
  Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }
  /// The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }
  /// Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as it has a reduced likelihood of
    // having to update as many def-use and use-def chains.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << " (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }
private:
  /// Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// The original loop.
  Loop *OrigLoop;

  /// The cloned loop. If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop = nullptr;

  /// The blocks of ClonedLoop including the preheader. If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// This gets populated once the set of instructions has been
  /// finalized. If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// Holds the set of Partitions. It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  using InstToPartitionIdT = DenseMap<Instruction *, int>;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }
  /// Adds \p Inst into the current partition if that is marked to
  /// contain cycles. Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }
  /// Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  // Initially we isolate memory instructions into as many partitions as
  // possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition. We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }
  /// If a partition contains only conditional stores, we won't vectorize
  /// it. Try to merge it with a previous cyclic partition.
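  ///
  /// (Illustrative example, editor's addition: a store guarded by
  /// `if (cond[i])` inside the loop body needs predication. If every store
  /// in a partition is of that kind, the vectorizer would have to if-convert
  /// the whole partition, so keeping it separate buys nothing.)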
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }
  /// Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }
  /// Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// \return true if any partitions were merged.
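  ///
  /// (Illustrative example, editor's addition: if the same load appears in
  /// partitions P1 and P3, cloning the loop per partition would execute a
  /// second copy of the load after P2's memory operations, an order the
  /// dependence information never accounted for; merging P1..P3 keeps a
  /// single copy in the original position.)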
  bool mergeToAvoidDuplicatedLoads() {
    using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
    using ToBeMergedT = EquivalenceClasses<InstPartition *>;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load. Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            LLVM_DEBUG(dbgs()
                       << "Merging partitions due to this load in multiple "
                       << "partitions: " << PartI << ", " << LoadToPart->second
                       << "\n"
                       << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader. This
    // makes the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }
  /// Sets up the mapping from instructions to partitions. If the
  /// instruction is duplicated across multiple partitions, set the entry
  /// to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }
  /// Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }
  /// This performs the main chunk of the work of cloning the loops for
  /// the partitions.
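  ///
  /// (Editor's sketch of the resulting CFG for two partitions P0 and P1:
  ///
  ///   Pred -> PH0 -> P0's loop -> PH1 -> P1's loop -> ExitBlock
  ///
  /// Each cloned loop exits into the preheader of the next loop; this is
  /// what the VMap entry for ExitBlock below arranges.)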
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Preserve the original loop ID for use after the transformation.
    MDNode *OrigLoopID = L->getLoopID();

    // Create a loop for each partition except the last. Clone the original
    // loop before PH along with adding a preheader for the cloned loop. Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
      setNewLoopID(OrigLoopID, Part);
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Also set a new loop ID for the last loop.
    setNewLoopID(OrigLoopID, &PartitionContainer.back());

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop. Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }
  /// Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }
  /// For each memory pointer, it computes the partitionId the pointer is
  /// used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the
  /// I-th entry in LAI.getRuntimePointerChecking(). If the pointer is used
  /// in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }
  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }
private:
  using PartitionContainerT = std::list<InstPartition>;

  /// List of partitions.
  PartitionContainerT PartitionContainer;

  /// Mapping from Instruction to partition Id. If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
  /// Assign new LoopIDs for the partition's cloned loop.
  void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
    Optional<MDNode *> PartitionID = makeFollowupLoopID(
        OrigLoopID,
        {LLVMLoopDistributeFollowupAll,
         Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
                             : LLVMLoopDistributeFollowupCoincident});
    if (PartitionID.hasValue()) {
      Loop *NewLoop = Part->getDistributedLoop();
      NewLoop->setLoopID(PartitionID.getValue());
    }
  }
};

/// For each memory instruction, this class maintains the difference between
/// the number of unsafe dependences that start at this instruction and the
/// number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a program
/// point.
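///
/// (Editor's illustration: with a single backward dependence S1 -> S2, the
/// per-instruction value is +1 at S1 and -1 at S2, so the running sum stays
/// positive from S1 until S2, putting everything in between into the same
/// unsafe region.)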
class MemoryInstructionDependences {
  using Dependence = MemoryDepChecker::Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd = 0;

    Entry(Instruction *Inst) : Inst(Inst) {}
  };

  using AccessesType = SmallVector<Entry, 8>;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    LLVM_DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first. (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }
  /// Try to distribute an inner-most loop.
  bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
    assert(L->empty() && "Only process inner loops.");

    LLVM_DEBUG(dbgs() << "\nLDist: In \""
                      << L->getHeader()->getParent()->getName()
                      << "\" checking " << *L << "\n");

    if (!L->getExitBlock())
      return fail("MultipleExitBlocks", "multiple exit blocks");
    if (!L->isLoopSimplifyForm())
      return fail("NotLoopSimplifyForm",
                  "loop is not in loop-simplify form");

    BasicBlock *PH = L->getLoopPreheader();

    // LAA will check that we only have a single exiting block.
    LAI = &GetLAA(*L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("MemOpsCanBeVectorized",
                  "memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign them to consecutive
    // partitions (the order of partitions follows program order). Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store into its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations. This
    // is to preserve the original program order after distribution. E.g.:
    //
    //            NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1  -.                1                         0->1
    //  Load2   | /Unsafe/       0                         1
    //  Store3 -'               -1                         1->0
    //  Load4                    0                         0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this
    // case we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so we catch
      // the start of a dependence directly via
      // NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }
    // Add partitions for values used outside. These partitions can be out of
    // order from the original program order. This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("CantIsolateUnsafeDeps",
                  "cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                        << Partitions);
      if (Partitions.getSize() < 2)
        return fail("CantIsolateUnsafeDeps",
                    "cannot isolate unsafe dependencies");
    }
    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("TooManySCEVRuntimeChecks",
                  "too many SCEV run-time checks needed");

    if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
      return fail("HeuristicDisabled", "distribution heuristic disabled");
    LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or
    // clone the loop. (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      MDNode *OrigLoopID = L->getLoopID();

      LLVM_DEBUG(dbgs() << "\nPointers:\n");
      LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();

      // The unversioned loop will not be changed, so we inherit all
      // attributes from the original loop, but remove the loop distribution
      // metadata to avoid distributing it again.
      MDNode *UnversionedLoopID =
          makeFollowupLoopID(OrigLoopID,
                             {LLVMLoopDistributeFollowupAll,
                              LLVMLoopDistributeFollowupFallback},
                             "llvm.loop.distribute.", true)
              .getValue();
      LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
    }

    // Create identical copies of the original loop for each partition and
    // hook them up sequentially.
    Partitions.cloneLoops();

    // Now, remove from each loop the instructions that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    LLVM_DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify(*DT);
      assert(DT->verify(DominatorTree::VerificationLevel::Fast));
    }

    ++NumLoopsDistributed;
    // Report the success.
    ORE->emit([&]() {
      return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
                                L->getHeader())
             << "distributed loop";
    });
    return true;
  }
  /// Provide diagnostics, then \return false.
  bool fail(StringRef RemarkName, StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With Rpass-missed report that distribution failed.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
                                      L->getStartLoc(), L->getHeader())
             << "loop not distributed: use -Rpass-analysis=loop-distribute "
                "for more info";
    });

    // With Rpass-analysis report why. This is on by default if distribution
    // was requested explicitly.
    ORE->emit(OptimizationRemarkAnalysis(
                  Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
                  RemarkName, L->getStartLoc(), L->getHeader())
              << "loop not distributed: " << Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }
  /// Return whether distribution is forced to be enabled/disabled for the
  /// loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }
private:
  /// Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers. Partition
  /// number -1 means that the pointer is used in multiple partitions. In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    copy_if(AllChecks, std::back_inserter(Checks),
            [&](const RuntimePointerChecking::PointerCheck &Check) {
              for (unsigned PtrIdx1 : Check.first->Members)
                for (unsigned PtrIdx2 : Check.second->Members)
                  // Only include this check if there is a pair of pointers
                  // that require checking and the pointers fall into
                  // separate partitions.
                  //
                  // (Note that we already know at this point that the two
                  // pointer groups need checking but it doesn't follow
                  // that each pair of pointers within the two groups need
                  // checking as well.
                  //
                  // In other words we don't want to include a check just
                  // because there is a pair of pointers between the two
                  // pointer groups that require checks and a different
                  // pair whose pointers fall into different partitions.)
                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                      !RuntimePointerChecking::arePointersInSamePartition(
                          PtrToPartition, PtrIdx1, PtrIdx2))
                    return true;
              return false;
            });

    return Checks;
  }
  /// Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
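  ///
  /// (Editor's note: in Clang this is typically the result of
  /// `#pragma clang loop distribute(enable)`, which is lowered to loop
  /// metadata along the lines of
  /// `!{!"llvm.loop.distribute.enable", i1 true}`.)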
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }
  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI = nullptr;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was
  /// forced to be enabled (true) or disabled (false). If the optional has no
  /// value distribution was not forced either way.
  Optional<bool> IsForced;
};

} // end anonymous namespace

/// Shared implementation between new and old PMs.
static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
                    ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
                    std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
  // Build up a worklist of inner-loops to distribute. This is necessary as
  // the act of distributing a loop creates new loops and can invalidate
  // iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *TopLevelLoop : *LI)
    for (Loop *L : depth_first(TopLevelLoop))
      // We only handle inner-most loops.
      if (L->empty())
        Worklist.push_back(L);

  // Now walk the identified inner loops.
  bool Changed = false;
  for (Loop *L : Worklist) {
    LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

    // If distribution was forced for the specific loop to be
    // enabled/disabled, follow that. Otherwise use the global flag.
    if (LDL.isForced().getValueOr(EnableLoopDistribute))
      Changed |= LDL.processLoop(GetLAA);
  }

  return Changed;
}

namespace {

/// The pass class.
class LoopDistributeLegacy : public FunctionPass {
public:
  static char ID;

  LoopDistributeLegacy() : FunctionPass(ID) {
    // The default is set by the caller.
    initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return runImpl(F, LI, DT, SE, ORE, GetLAA);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
  }
};

} // end anonymous namespace
PreservedAnalyses LoopDistributePass::run(Function &F,
                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // We don't directly need these analyses but they're required for loop
  // analyses so provide them below.
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };

  bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserve<LoopAnalysis>();
  PA.preserve<DominatorTreeAnalysis>();
  PA.preserve<GlobalsAA>();
  return PA;
}
char LoopDistributeLegacy::ID;

static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)

FunctionPass *llvm::createLoopDistributePass() {
  return new LoopDistributeLegacy();
}