LoopUnroll.cpp 35 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906
  1. //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements some loop unrolling utilities. It does not define any
  11. // actual pass or policy, but provides a single function to perform loop
  12. // unrolling.
  13. //
  14. // The process of unrolling can produce extraneous basic blocks linked with
  15. // unconditional branches. This will be corrected in the future.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #include "llvm/ADT/SmallPtrSet.h"
  19. #include "llvm/ADT/Statistic.h"
  20. #include "llvm/Analysis/AssumptionCache.h"
  21. #include "llvm/Analysis/InstructionSimplify.h"
  22. #include "llvm/Analysis/LoopIterator.h"
  23. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  24. #include "llvm/Analysis/ScalarEvolution.h"
  25. #include "llvm/IR/BasicBlock.h"
  26. #include "llvm/IR/DataLayout.h"
  27. #include "llvm/IR/DebugInfoMetadata.h"
  28. #include "llvm/IR/Dominators.h"
  29. #include "llvm/IR/IntrinsicInst.h"
  30. #include "llvm/IR/LLVMContext.h"
  31. #include "llvm/Support/Debug.h"
  32. #include "llvm/Support/raw_ostream.h"
  33. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  34. #include "llvm/Transforms/Utils/Cloning.h"
  35. #include "llvm/Transforms/Utils/Local.h"
  36. #include "llvm/Transforms/Utils/LoopSimplify.h"
  37. #include "llvm/Transforms/Utils/LoopUtils.h"
  38. #include "llvm/Transforms/Utils/SimplifyIndVar.h"
  39. #include "llvm/Transforms/Utils/UnrollLoop.h"
  40. using namespace llvm;
  41. #define DEBUG_TYPE "loop-unroll"
  42. // TODO: Should these be here or in LoopUnroll?
  43. STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
  44. STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
  45. static cl::opt<bool>
  46. UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
  47. cl::desc("Allow runtime unrolled loops to be unrolled "
  48. "with epilog instead of prolog."));
  49. static cl::opt<bool>
  50. UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
  51. cl::desc("Verify domtree after unrolling"),
  52. #ifdef NDEBUG
  53. cl::init(false)
  54. #else
  55. cl::init(true)
  56. #endif
  57. );
  58. /// Convert the instruction operands from referencing the current values into
  59. /// those specified by VMap.
  60. static inline void remapInstruction(Instruction *I,
  61. ValueToValueMapTy &VMap) {
  62. for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
  63. Value *Op = I->getOperand(op);
  64. // Unwrap arguments of dbg.value intrinsics.
  65. bool Wrapped = false;
  66. if (auto *V = dyn_cast<MetadataAsValue>(Op))
  67. if (auto *Unwrapped = dyn_cast<ValueAsMetadata>(V->getMetadata())) {
  68. Op = Unwrapped->getValue();
  69. Wrapped = true;
  70. }
  71. auto wrap = [&](Value *V) {
  72. auto &C = I->getContext();
  73. return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V;
  74. };
  75. ValueToValueMapTy::iterator It = VMap.find(Op);
  76. if (It != VMap.end())
  77. I->setOperand(op, wrap(It->second));
  78. }
  79. if (PHINode *PN = dyn_cast<PHINode>(I)) {
  80. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
  81. ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
  82. if (It != VMap.end())
  83. PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
  84. }
  85. }
  86. }
  87. /// Folds a basic block into its predecessor if it only has one predecessor, and
  88. /// that predecessor only has one successor.
  89. /// The LoopInfo Analysis that is passed will be kept consistent. If folding is
  90. /// successful references to the containing loop must be removed from
  91. /// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
  92. /// references to the eliminated BB. The argument ForgottenLoops contains a set
  93. /// of loops that have already been forgotten to prevent redundant, expensive
  94. /// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
  95. static BasicBlock *
  96. foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
  97. SmallPtrSetImpl<Loop *> &ForgottenLoops,
  98. DominatorTree *DT) {
  99. // Merge basic blocks into their predecessor if there is only one distinct
  100. // pred, and if there is only one distinct successor of the predecessor, and
  101. // if there are no PHI nodes.
  102. BasicBlock *OnlyPred = BB->getSinglePredecessor();
  103. if (!OnlyPred) return nullptr;
  104. if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
  105. return nullptr;
  106. DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
  107. // Resolve any PHI nodes at the start of the block. They are all
  108. // guaranteed to have exactly one entry if they exist, unless there are
  109. // multiple duplicate (but guaranteed to be equal) entries for the
  110. // incoming edges. This occurs when there are multiple edges from
  111. // OnlyPred to OnlySucc.
  112. FoldSingleEntryPHINodes(BB);
  113. // Delete the unconditional branch from the predecessor...
  114. OnlyPred->getInstList().pop_back();
  115. // Make all PHI nodes that referred to BB now refer to Pred as their
  116. // source...
  117. BB->replaceAllUsesWith(OnlyPred);
  118. // Move all definitions in the successor to the predecessor...
  119. OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
  120. // OldName will be valid until erased.
  121. StringRef OldName = BB->getName();
  122. // Erase the old block and update dominator info.
  123. if (DT)
  124. if (DomTreeNode *DTN = DT->getNode(BB)) {
  125. DomTreeNode *PredDTN = DT->getNode(OnlyPred);
  126. SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
  127. for (auto *DI : Children)
  128. DT->changeImmediateDominator(DI, PredDTN);
  129. DT->eraseNode(BB);
  130. }
  131. // ScalarEvolution holds references to loop exit blocks.
  132. if (SE) {
  133. if (Loop *L = LI->getLoopFor(BB)) {
  134. if (ForgottenLoops.insert(L).second)
  135. SE->forgetLoop(L);
  136. }
  137. }
  138. LI->removeBlock(BB);
  139. // Inherit predecessor's name if it exists...
  140. if (!OldName.empty() && !OnlyPred->hasName())
  141. OnlyPred->setName(OldName);
  142. BB->eraseFromParent();
  143. return OnlyPred;
  144. }
  145. /// Check if unrolling created a situation where we need to insert phi nodes to
  146. /// preserve LCSSA form.
  147. /// \param Blocks is a vector of basic blocks representing unrolled loop.
  148. /// \param L is the outer loop.
  149. /// It's possible that some of the blocks are in L, and some are not. In this
  150. /// case, if there is a use is outside L, and definition is inside L, we need to
  151. /// insert a phi-node, otherwise LCSSA will be broken.
  152. /// The function is just a helper function for llvm::UnrollLoop that returns
  153. /// true if this situation occurs, indicating that LCSSA needs to be fixed.
  154. static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
  155. LoopInfo *LI) {
  156. for (BasicBlock *BB : Blocks) {
  157. if (LI->getLoopFor(BB) == L)
  158. continue;
  159. for (Instruction &I : *BB) {
  160. for (Use &U : I.operands()) {
  161. if (auto Def = dyn_cast<Instruction>(U)) {
  162. Loop *DefLoop = LI->getLoopFor(Def->getParent());
  163. if (!DefLoop)
  164. continue;
  165. if (DefLoop->contains(L))
  166. return true;
  167. }
  168. }
  169. }
  170. }
  171. return false;
  172. }
  173. /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
  174. /// and adds a mapping from the original loop to the new loop to NewLoops.
  175. /// Returns nullptr if no new loop was created and a pointer to the
  176. /// original loop OriginalBB was part of otherwise.
  177. const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
  178. BasicBlock *ClonedBB, LoopInfo *LI,
  179. NewLoopsMap &NewLoops) {
  180. // Figure out which loop New is in.
  181. const Loop *OldLoop = LI->getLoopFor(OriginalBB);
  182. assert(OldLoop && "Should (at least) be in the loop being unrolled!");
  183. Loop *&NewLoop = NewLoops[OldLoop];
  184. if (!NewLoop) {
  185. // Found a new sub-loop.
  186. assert(OriginalBB == OldLoop->getHeader() &&
  187. "Header should be first in RPO");
  188. NewLoop = LI->AllocateLoop();
  189. Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
  190. if (NewLoopParent)
  191. NewLoopParent->addChildLoop(NewLoop);
  192. else
  193. LI->addTopLevelLoop(NewLoop);
  194. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  195. return OldLoop;
  196. } else {
  197. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  198. return nullptr;
  199. }
  200. }
  201. /// The function chooses which type of unroll (epilog or prolog) is more
  202. /// profitabale.
  203. /// Epilog unroll is more profitable when there is PHI that starts from
  204. /// constant. In this case epilog will leave PHI start from constant,
  205. /// but prolog will convert it to non-constant.
  206. ///
  207. /// loop:
  208. /// PN = PHI [I, Latch], [CI, PreHeader]
  209. /// I = foo(PN)
  210. /// ...
  211. ///
  212. /// Epilog unroll case.
  213. /// loop:
  214. /// PN = PHI [I2, Latch], [CI, PreHeader]
  215. /// I1 = foo(PN)
  216. /// I2 = foo(I1)
  217. /// ...
  218. /// Prolog unroll case.
  219. /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
  220. /// loop:
  221. /// PN = PHI [I2, Latch], [NewPN, PreHeader]
  222. /// I1 = foo(PN)
  223. /// I2 = foo(I1)
  224. /// ...
  225. ///
  226. static bool isEpilogProfitable(Loop *L) {
  227. BasicBlock *PreHeader = L->getLoopPreheader();
  228. BasicBlock *Header = L->getHeader();
  229. assert(PreHeader && Header);
  230. for (const PHINode &PN : Header->phis()) {
  231. if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
  232. return true;
  233. }
  234. return false;
  235. }
  236. /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
  237. /// can only fail when the loop's latch block is not terminated by a conditional
  238. /// branch instruction. However, if the trip count (and multiple) are not known,
  239. /// loop unrolling will mostly produce more code that is no faster.
  240. ///
  241. /// TripCount is the upper bound of the iteration on which control exits
  242. /// LatchBlock. Control may exit the loop prior to TripCount iterations either
  243. /// via an early branch in other loop block or via LatchBlock terminator. This
  244. /// is relaxed from the general definition of trip count which is the number of
  245. /// times the loop header executes. Note that UnrollLoop assumes that the loop
  246. /// counter test is in LatchBlock in order to remove unnecesssary instances of
  247. /// the test. If control can exit the loop from the LatchBlock's terminator
  248. /// prior to TripCount iterations, flag PreserveCondBr needs to be set.
  249. ///
  250. /// PreserveCondBr indicates whether the conditional branch of the LatchBlock
  251. /// needs to be preserved. It is needed when we use trip count upper bound to
  252. /// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
  253. /// conditional branch needs to be preserved.
  254. ///
  255. /// Similarly, TripMultiple divides the number of times that the LatchBlock may
  256. /// execute without exiting the loop.
  257. ///
  258. /// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
  259. /// have a runtime (i.e. not compile time constant) trip count. Unrolling these
  260. /// loops require a unroll "prologue" that runs "RuntimeTripCount % Count"
  261. /// iterations before branching into the unrolled loop. UnrollLoop will not
  262. /// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
  263. /// AllowExpensiveTripCount is false.
  264. ///
  265. /// If we want to perform PGO-based loop peeling, PeelCount is set to the
  266. /// number of iterations we want to peel off.
  267. ///
  268. /// The LoopInfo Analysis that is passed will be kept consistent.
  269. ///
  270. /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
  271. /// DominatorTree if they are non-null.
  272. LoopUnrollResult llvm::UnrollLoop(
  273. Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime,
  274. bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst,
  275. unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder,
  276. LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
  277. OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) {
  278. BasicBlock *Preheader = L->getLoopPreheader();
  279. if (!Preheader) {
  280. DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
  281. return LoopUnrollResult::Unmodified;
  282. }
  283. BasicBlock *LatchBlock = L->getLoopLatch();
  284. if (!LatchBlock) {
  285. DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
  286. return LoopUnrollResult::Unmodified;
  287. }
  288. // Loops with indirectbr cannot be cloned.
  289. if (!L->isSafeToClone()) {
  290. DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
  291. return LoopUnrollResult::Unmodified;
  292. }
  293. // The current loop unroll pass can only unroll loops with a single latch
  294. // that's a conditional branch exiting the loop.
  295. // FIXME: The implementation can be extended to work with more complicated
  296. // cases, e.g. loops with multiple latches.
  297. BasicBlock *Header = L->getHeader();
  298. BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  299. if (!BI || BI->isUnconditional()) {
  300. // The loop-rotate pass can be helpful to avoid this in many cases.
  301. DEBUG(dbgs() <<
  302. " Can't unroll; loop not terminated by a conditional branch.\n");
  303. return LoopUnrollResult::Unmodified;
  304. }
  305. auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
  306. return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2));
  307. };
  308. if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
  309. DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
  310. " exiting the loop can be unrolled\n");
  311. return LoopUnrollResult::Unmodified;
  312. }
  313. if (Header->hasAddressTaken()) {
  314. // The loop-rotate pass can be helpful to avoid this in many cases.
  315. DEBUG(dbgs() <<
  316. " Won't unroll loop: address of header block is taken.\n");
  317. return LoopUnrollResult::Unmodified;
  318. }
  319. if (TripCount != 0)
  320. DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
  321. if (TripMultiple != 1)
  322. DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
  323. // Effectively "DCE" unrolled iterations that are beyond the tripcount
  324. // and will never be executed.
  325. if (TripCount != 0 && Count > TripCount)
  326. Count = TripCount;
  327. // Don't enter the unroll code if there is nothing to do.
  328. if (TripCount == 0 && Count < 2 && PeelCount == 0) {
  329. DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
  330. return LoopUnrollResult::Unmodified;
  331. }
  332. assert(Count > 0);
  333. assert(TripMultiple > 0);
  334. assert(TripCount == 0 || TripCount % TripMultiple == 0);
  335. // Are we eliminating the loop control altogether?
  336. bool CompletelyUnroll = Count == TripCount;
  337. SmallVector<BasicBlock *, 4> ExitBlocks;
  338. L->getExitBlocks(ExitBlocks);
  339. std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
  340. // Go through all exits of L and see if there are any phi-nodes there. We just
  341. // conservatively assume that they're inserted to preserve LCSSA form, which
  342. // means that complete unrolling might break this form. We need to either fix
  343. // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
  344. // now we just recompute LCSSA for the outer loop, but it should be possible
  345. // to fix it in-place.
  346. bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
  347. any_of(ExitBlocks, [](const BasicBlock *BB) {
  348. return isa<PHINode>(BB->begin());
  349. });
  350. // We assume a run-time trip count if the compiler cannot
  351. // figure out the loop trip count and the unroll-runtime
  352. // flag is specified.
  353. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
  354. assert((!RuntimeTripCount || !PeelCount) &&
  355. "Did not expect runtime trip-count unrolling "
  356. "and peeling for the same loop");
  357. if (PeelCount) {
  358. bool Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
  359. // Successful peeling may result in a change in the loop preheader/trip
  360. // counts. If we later unroll the loop, we want these to be updated.
  361. if (Peeled) {
  362. BasicBlock *ExitingBlock = L->getExitingBlock();
  363. assert(ExitingBlock && "Loop without exiting block?");
  364. Preheader = L->getLoopPreheader();
  365. TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
  366. TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
  367. }
  368. }
  369. // Loops containing convergent instructions must have a count that divides
  370. // their TripMultiple.
  371. DEBUG(
  372. {
  373. bool HasConvergent = false;
  374. for (auto &BB : L->blocks())
  375. for (auto &I : *BB)
  376. if (auto CS = CallSite(&I))
  377. HasConvergent |= CS.isConvergent();
  378. assert((!HasConvergent || TripMultiple % Count == 0) &&
  379. "Unroll count must divide trip multiple if loop contains a "
  380. "convergent operation.");
  381. });
  382. bool EpilogProfitability =
  383. UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
  384. : isEpilogProfitable(L);
  385. if (RuntimeTripCount && TripMultiple % Count != 0 &&
  386. !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
  387. EpilogProfitability, UnrollRemainder, LI, SE,
  388. DT, AC, PreserveLCSSA)) {
  389. if (Force)
  390. RuntimeTripCount = false;
  391. else {
  392. DEBUG(
  393. dbgs() << "Wont unroll; remainder loop could not be generated"
  394. "when assuming runtime trip count\n");
  395. return LoopUnrollResult::Unmodified;
  396. }
  397. }
  398. // Notify ScalarEvolution that the loop will be substantially changed,
  399. // if not outright eliminated.
  400. if (SE)
  401. SE->forgetLoop(L);
  402. // If we know the trip count, we know the multiple...
  403. unsigned BreakoutTrip = 0;
  404. if (TripCount != 0) {
  405. BreakoutTrip = TripCount % Count;
  406. TripMultiple = 0;
  407. } else {
  408. // Figure out what multiple to use.
  409. BreakoutTrip = TripMultiple =
  410. (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
  411. }
  412. using namespace ore;
  413. // Report the unrolling decision.
  414. if (CompletelyUnroll) {
  415. DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
  416. << " with trip count " << TripCount << "!\n");
  417. if (ORE)
  418. ORE->emit([&]() {
  419. return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
  420. L->getHeader())
  421. << "completely unrolled loop with "
  422. << NV("UnrollCount", TripCount) << " iterations";
  423. });
  424. } else if (PeelCount) {
  425. DEBUG(dbgs() << "PEELING loop %" << Header->getName()
  426. << " with iteration count " << PeelCount << "!\n");
  427. if (ORE)
  428. ORE->emit([&]() {
  429. return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
  430. L->getHeader())
  431. << " peeled loop by " << NV("PeelCount", PeelCount)
  432. << " iterations";
  433. });
  434. } else {
  435. auto DiagBuilder = [&]() {
  436. OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
  437. L->getHeader());
  438. return Diag << "unrolled loop by a factor of "
  439. << NV("UnrollCount", Count);
  440. };
  441. DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
  442. << " by " << Count);
  443. if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
  444. DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
  445. if (ORE)
  446. ORE->emit([&]() {
  447. return DiagBuilder() << " with a breakout at trip "
  448. << NV("BreakoutTrip", BreakoutTrip);
  449. });
  450. } else if (TripMultiple != 1) {
  451. DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
  452. if (ORE)
  453. ORE->emit([&]() {
  454. return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
  455. << " trips per branch";
  456. });
  457. } else if (RuntimeTripCount) {
  458. DEBUG(dbgs() << " with run-time trip count");
  459. if (ORE)
  460. ORE->emit(
  461. [&]() { return DiagBuilder() << " with run-time trip count"; });
  462. }
  463. DEBUG(dbgs() << "!\n");
  464. }
  465. bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  466. BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  467. // For the first iteration of the loop, we should use the precloned values for
  468. // PHI nodes. Insert associations now.
  469. ValueToValueMapTy LastValueMap;
  470. std::vector<PHINode*> OrigPHINode;
  471. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  472. OrigPHINode.push_back(cast<PHINode>(I));
  473. }
  474. std::vector<BasicBlock*> Headers;
  475. std::vector<BasicBlock*> Latches;
  476. Headers.push_back(Header);
  477. Latches.push_back(LatchBlock);
  478. // The current on-the-fly SSA update requires blocks to be processed in
  479. // reverse postorder so that LastValueMap contains the correct value at each
  480. // exit.
  481. LoopBlocksDFS DFS(L);
  482. DFS.perform(LI);
  483. // Stash the DFS iterators before adding blocks to the loop.
  484. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  485. LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
  486. std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
  487. // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
  488. // might break loop-simplified form for these loops (as they, e.g., would
  489. // share the same exit blocks). We'll keep track of loops for which we can
  490. // break this so that later we can re-simplify them.
  491. SmallSetVector<Loop *, 4> LoopsToSimplify;
  492. for (Loop *SubLoop : *L)
  493. LoopsToSimplify.insert(SubLoop);
  494. if (Header->getParent()->isDebugInfoForProfiling())
  495. for (BasicBlock *BB : L->getBlocks())
  496. for (Instruction &I : *BB)
  497. if (!isa<DbgInfoIntrinsic>(&I))
  498. if (const DILocation *DIL = I.getDebugLoc())
  499. I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
  500. for (unsigned It = 1; It != Count; ++It) {
  501. std::vector<BasicBlock*> NewBlocks;
  502. SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
  503. NewLoops[L] = L;
  504. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  505. ValueToValueMapTy VMap;
  506. BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
  507. Header->getParent()->getBasicBlockList().push_back(New);
  508. assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
  509. "Header should not be in a sub-loop");
  510. // Tell LI about New.
  511. const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
  512. if (OldLoop) {
  513. LoopsToSimplify.insert(NewLoops[OldLoop]);
  514. // Forget the old loop, since its inputs may have changed.
  515. if (SE)
  516. SE->forgetLoop(OldLoop);
  517. }
  518. if (*BB == Header)
  519. // Loop over all of the PHI nodes in the block, changing them to use
  520. // the incoming values from the previous block.
  521. for (PHINode *OrigPHI : OrigPHINode) {
  522. PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
  523. Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
  524. if (Instruction *InValI = dyn_cast<Instruction>(InVal))
  525. if (It > 1 && L->contains(InValI))
  526. InVal = LastValueMap[InValI];
  527. VMap[OrigPHI] = InVal;
  528. New->getInstList().erase(NewPHI);
  529. }
  530. // Update our running map of newest clones
  531. LastValueMap[*BB] = New;
  532. for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
  533. VI != VE; ++VI)
  534. LastValueMap[VI->first] = VI->second;
  535. // Add phi entries for newly created values to all exit blocks.
  536. for (BasicBlock *Succ : successors(*BB)) {
  537. if (L->contains(Succ))
  538. continue;
  539. for (PHINode &PHI : Succ->phis()) {
  540. Value *Incoming = PHI.getIncomingValueForBlock(*BB);
  541. ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
  542. if (It != LastValueMap.end())
  543. Incoming = It->second;
  544. PHI.addIncoming(Incoming, New);
  545. }
  546. }
  547. // Keep track of new headers and latches as we create them, so that
  548. // we can insert the proper branches later.
  549. if (*BB == Header)
  550. Headers.push_back(New);
  551. if (*BB == LatchBlock)
  552. Latches.push_back(New);
  553. NewBlocks.push_back(New);
  554. UnrolledLoopBlocks.push_back(New);
  555. // Update DomTree: since we just copy the loop body, and each copy has a
  556. // dedicated entry block (copy of the header block), this header's copy
  557. // dominates all copied blocks. That means, dominance relations in the
  558. // copied body are the same as in the original body.
  559. if (DT) {
  560. if (*BB == Header)
  561. DT->addNewBlock(New, Latches[It - 1]);
  562. else {
  563. auto BBDomNode = DT->getNode(*BB);
  564. auto BBIDom = BBDomNode->getIDom();
  565. BasicBlock *OriginalBBIDom = BBIDom->getBlock();
  566. DT->addNewBlock(
  567. New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
  568. }
  569. }
  570. }
  571. // Remap all instructions in the most recent iteration
  572. for (BasicBlock *NewBlock : NewBlocks) {
  573. for (Instruction &I : *NewBlock) {
  574. ::remapInstruction(&I, LastValueMap);
  575. if (auto *II = dyn_cast<IntrinsicInst>(&I))
  576. if (II->getIntrinsicID() == Intrinsic::assume)
  577. AC->registerAssumption(II);
  578. }
  579. }
  580. }
  581. // Loop over the PHI nodes in the original block, setting incoming values.
  582. for (PHINode *PN : OrigPHINode) {
  583. if (CompletelyUnroll) {
  584. PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
  585. Header->getInstList().erase(PN);
  586. }
  587. else if (Count > 1) {
  588. Value *InVal = PN->removeIncomingValue(LatchBlock, false);
  589. // If this value was defined in the loop, take the value defined by the
  590. // last iteration of the loop.
  591. if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
  592. if (L->contains(InValI))
  593. InVal = LastValueMap[InVal];
  594. }
  595. assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
  596. PN->addIncoming(InVal, Latches.back());
  597. }
  598. }
  599. // Now that all the basic blocks for the unrolled iterations are in place,
  600. // set up the branches to connect them.
  601. for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
  602. // The original branch was replicated in each unrolled iteration.
  603. BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
  604. // The branch destination.
  605. unsigned j = (i + 1) % e;
  606. BasicBlock *Dest = Headers[j];
  607. bool NeedConditional = true;
  608. if (RuntimeTripCount && j != 0) {
  609. NeedConditional = false;
  610. }
  611. // For a complete unroll, make the last iteration end with a branch
  612. // to the exit block.
  613. if (CompletelyUnroll) {
  614. if (j == 0)
  615. Dest = LoopExit;
  616. // If using trip count upper bound to completely unroll, we need to keep
  617. // the conditional branch except the last one because the loop may exit
  618. // after any iteration.
  619. assert(NeedConditional &&
  620. "NeedCondition cannot be modified by both complete "
  621. "unrolling and runtime unrolling");
  622. NeedConditional = (PreserveCondBr && j && !(PreserveOnlyFirst && i != 0));
  623. } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
  624. // If we know the trip count or a multiple of it, we can safely use an
  625. // unconditional branch for some iterations.
  626. NeedConditional = false;
  627. }
  628. if (NeedConditional) {
  629. // Update the conditional branch's successor for the following
  630. // iteration.
  631. Term->setSuccessor(!ContinueOnTrue, Dest);
  632. } else {
  633. // Remove phi operands at this loop exit
  634. if (Dest != LoopExit) {
  635. BasicBlock *BB = Latches[i];
  636. for (BasicBlock *Succ: successors(BB)) {
  637. if (Succ == Headers[i])
  638. continue;
  639. for (PHINode &Phi : Succ->phis())
  640. Phi.removeIncomingValue(BB, false);
  641. }
  642. }
  643. // Replace the conditional branch with an unconditional one.
  644. BranchInst::Create(Dest, Term);
  645. Term->eraseFromParent();
  646. }
  647. }
  648. // Update dominators of blocks we might reach through exits.
  649. // Immediate dominator of such block might change, because we add more
  650. // routes which can lead to the exit: we can now reach it from the copied
  651. // iterations too.
  652. if (DT && Count > 1) {
  653. for (auto *BB : OriginalLoopBlocks) {
  654. auto *BBDomNode = DT->getNode(BB);
  655. SmallVector<BasicBlock *, 16> ChildrenToUpdate;
  656. for (auto *ChildDomNode : BBDomNode->getChildren()) {
  657. auto *ChildBB = ChildDomNode->getBlock();
  658. if (!L->contains(ChildBB))
  659. ChildrenToUpdate.push_back(ChildBB);
  660. }
  661. BasicBlock *NewIDom;
  662. if (BB == LatchBlock) {
  663. // The latch is special because we emit unconditional branches in
  664. // some cases where the original loop contained a conditional branch.
  665. // Since the latch is always at the bottom of the loop, if the latch
  666. // dominated an exit before unrolling, the new dominator of that exit
  667. // must also be a latch. Specifically, the dominator is the first
  668. // latch which ends in a conditional branch, or the last latch if
  669. // there is no such latch.
  670. NewIDom = Latches.back();
  671. for (BasicBlock *IterLatch : Latches) {
  672. TerminatorInst *Term = IterLatch->getTerminator();
  673. if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
  674. NewIDom = IterLatch;
  675. break;
  676. }
  677. }
  678. } else {
  679. // The new idom of the block will be the nearest common dominator
  680. // of all copies of the previous idom. This is equivalent to the
  681. // nearest common dominator of the previous idom and the first latch,
  682. // which dominates all copies of the previous idom.
  683. NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
  684. }
  685. for (auto *ChildBB : ChildrenToUpdate)
  686. DT->changeImmediateDominator(ChildBB, NewIDom);
  687. }
  688. }
  689. if (DT && UnrollVerifyDomtree)
  690. DT->verifyDomTree();
  691. // Merge adjacent basic blocks, if possible.
  692. SmallPtrSet<Loop *, 4> ForgottenLoops;
  693. for (BasicBlock *Latch : Latches) {
  694. BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
  695. if (Term->isUnconditional()) {
  696. BasicBlock *Dest = Term->getSuccessor(0);
  697. if (BasicBlock *Fold =
  698. foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
  699. // Dest has been folded into Fold. Update our worklists accordingly.
  700. std::replace(Latches.begin(), Latches.end(), Dest, Fold);
  701. UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
  702. UnrolledLoopBlocks.end(), Dest),
  703. UnrolledLoopBlocks.end());
  704. }
  705. }
  706. }
  707. // Simplify any new induction variables in the partially unrolled loop.
  708. if (SE && !CompletelyUnroll && Count > 1) {
  709. SmallVector<WeakTrackingVH, 16> DeadInsts;
  710. simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
  711. // Aggressively clean up dead instructions that simplifyLoopIVs already
  712. // identified. Any remaining should be cleaned up below.
  713. while (!DeadInsts.empty())
  714. if (Instruction *Inst =
  715. dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
  716. RecursivelyDeleteTriviallyDeadInstructions(Inst);
  717. }
  718. // At this point, the code is well formed. We now do a quick sweep over the
  719. // inserted code, doing constant propagation and dead code elimination as we
  720. // go.
  721. const DataLayout &DL = Header->getModule()->getDataLayout();
  722. const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
  723. for (BasicBlock *BB : NewLoopBlocks) {
  724. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
  725. Instruction *Inst = &*I++;
  726. if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
  727. if (LI->replacementPreservesLCSSAForm(Inst, V))
  728. Inst->replaceAllUsesWith(V);
  729. if (isInstructionTriviallyDead(Inst))
  730. BB->getInstList().erase(Inst);
  731. }
  732. }
  733. // TODO: after peeling or unrolling, previously loop variant conditions are
  734. // likely to fold to constants, eagerly propagating those here will require
  735. // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
  736. // appropriate.
  737. NumCompletelyUnrolled += CompletelyUnroll;
  738. ++NumUnrolled;
  739. Loop *OuterL = L->getParentLoop();
  740. // Update LoopInfo if the loop is completely removed.
  741. if (CompletelyUnroll)
  742. LI->erase(L);
  743. // After complete unrolling most of the blocks should be contained in OuterL.
  744. // However, some of them might happen to be out of OuterL (e.g. if they
  745. // precede a loop exit). In this case we might need to insert PHI nodes in
  746. // order to preserve LCSSA form.
  747. // We don't need to check this if we already know that we need to fix LCSSA
  748. // form.
  749. // TODO: For now we just recompute LCSSA for the outer loop in this case, but
  750. // it should be possible to fix it in-place.
  751. if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
  752. NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
  753. // If we have a pass and a DominatorTree we should re-simplify impacted loops
  754. // to ensure subsequent analyses can rely on this form. We want to simplify
  755. // at least one layer outside of the loop that was unrolled so that any
  756. // changes to the parent loop exposed by the unrolling are considered.
  757. if (DT) {
  758. if (OuterL) {
  759. // OuterL includes all loops for which we can break loop-simplify, so
  760. // it's sufficient to simplify only it (it'll recursively simplify inner
  761. // loops too).
  762. if (NeedToFixLCSSA) {
  763. // LCSSA must be performed on the outermost affected loop. The unrolled
  764. // loop's last loop latch is guaranteed to be in the outermost loop
  765. // after LoopInfo's been updated by LoopInfo::erase.
  766. Loop *LatchLoop = LI->getLoopFor(Latches.back());
  767. Loop *FixLCSSALoop = OuterL;
  768. if (!FixLCSSALoop->contains(LatchLoop))
  769. while (FixLCSSALoop->getParentLoop() != LatchLoop)
  770. FixLCSSALoop = FixLCSSALoop->getParentLoop();
  771. formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
  772. } else if (PreserveLCSSA) {
  773. assert(OuterL->isLCSSAForm(*DT) &&
  774. "Loops should be in LCSSA form after loop-unroll.");
  775. }
  776. // TODO: That potentially might be compile-time expensive. We should try
  777. // to fix the loop-simplified form incrementally.
  778. simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
  779. } else {
  780. // Simplify loops for which we might've broken loop-simplify form.
  781. for (Loop *SubLoop : LoopsToSimplify)
  782. simplifyLoop(SubLoop, DT, LI, SE, AC, PreserveLCSSA);
  783. }
  784. }
  785. return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
  786. : LoopUnrollResult::PartiallyUnrolled;
  787. }
  788. /// Given an llvm.loop loop id metadata node, returns the loop hint metadata
  789. /// node with the given name (for example, "llvm.loop.unroll.count"). If no
  790. /// such metadata node exists, then nullptr is returned.
  791. MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
  792. // First operand should refer to the loop id itself.
  793. assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  794. assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
  795. for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
  796. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  797. if (!MD)
  798. continue;
  799. MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  800. if (!S)
  801. continue;
  802. if (Name.equals(S->getString()))
  803. return MD;
  804. }
  805. return nullptr;
  806. }