LoopUnroll.cpp 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891
  1. //===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements some loop unrolling utilities. It does not define any
  11. // actual pass or policy, but provides a single function to perform loop
  12. // unrolling.
  13. //
  14. // The process of unrolling can produce extraneous basic blocks linked with
  15. // unconditional branches. This will be corrected in the future.
  16. //
  17. //===----------------------------------------------------------------------===//
  18. #include "llvm/ADT/SmallPtrSet.h"
  19. #include "llvm/ADT/Statistic.h"
  20. #include "llvm/Analysis/AssumptionCache.h"
  21. #include "llvm/Analysis/InstructionSimplify.h"
  22. #include "llvm/Analysis/LoopIterator.h"
  23. #include "llvm/Analysis/LoopPass.h"
  24. #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
  25. #include "llvm/Analysis/ScalarEvolution.h"
  26. #include "llvm/IR/BasicBlock.h"
  27. #include "llvm/IR/DataLayout.h"
  28. #include "llvm/IR/DebugInfoMetadata.h"
  29. #include "llvm/IR/Dominators.h"
  30. #include "llvm/IR/IntrinsicInst.h"
  31. #include "llvm/IR/LLVMContext.h"
  32. #include "llvm/Support/Debug.h"
  33. #include "llvm/Support/raw_ostream.h"
  34. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  35. #include "llvm/Transforms/Utils/Cloning.h"
  36. #include "llvm/Transforms/Utils/Local.h"
  37. #include "llvm/Transforms/Utils/LoopSimplify.h"
  38. #include "llvm/Transforms/Utils/LoopUtils.h"
  39. #include "llvm/Transforms/Utils/SimplifyIndVar.h"
  40. #include "llvm/Transforms/Utils/UnrollLoop.h"
  41. using namespace llvm;
  42. #define DEBUG_TYPE "loop-unroll"
  43. // TODO: Should these be here or in LoopUnroll?
  44. STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
  45. STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
  46. static cl::opt<bool>
  47. UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
  48. cl::desc("Allow runtime unrolled loops to be unrolled "
  49. "with epilog instead of prolog."));
  50. static cl::opt<bool>
  51. UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
  52. cl::desc("Verify domtree after unrolling"),
  53. #ifdef NDEBUG
  54. cl::init(false)
  55. #else
  56. cl::init(true)
  57. #endif
  58. );
  59. /// Convert the instruction operands from referencing the current values into
  60. /// those specified by VMap.
  61. static inline void remapInstruction(Instruction *I,
  62. ValueToValueMapTy &VMap) {
  63. for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
  64. Value *Op = I->getOperand(op);
  65. ValueToValueMapTy::iterator It = VMap.find(Op);
  66. if (It != VMap.end())
  67. I->setOperand(op, It->second);
  68. }
  69. if (PHINode *PN = dyn_cast<PHINode>(I)) {
  70. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
  71. ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
  72. if (It != VMap.end())
  73. PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
  74. }
  75. }
  76. }
  77. /// Folds a basic block into its predecessor if it only has one predecessor, and
  78. /// that predecessor only has one successor.
  79. /// The LoopInfo Analysis that is passed will be kept consistent. If folding is
  80. /// successful references to the containing loop must be removed from
  81. /// ScalarEvolution by calling ScalarEvolution::forgetLoop because SE may have
  82. /// references to the eliminated BB. The argument ForgottenLoops contains a set
  83. /// of loops that have already been forgotten to prevent redundant, expensive
  84. /// calls to ScalarEvolution::forgetLoop. Returns the new combined block.
  85. static BasicBlock *
  86. foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
  87. SmallPtrSetImpl<Loop *> &ForgottenLoops,
  88. DominatorTree *DT) {
  89. // Merge basic blocks into their predecessor if there is only one distinct
  90. // pred, and if there is only one distinct successor of the predecessor, and
  91. // if there are no PHI nodes.
  92. BasicBlock *OnlyPred = BB->getSinglePredecessor();
  93. if (!OnlyPred) return nullptr;
  94. if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
  95. return nullptr;
  96. DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
  97. // Resolve any PHI nodes at the start of the block. They are all
  98. // guaranteed to have exactly one entry if they exist, unless there are
  99. // multiple duplicate (but guaranteed to be equal) entries for the
  100. // incoming edges. This occurs when there are multiple edges from
  101. // OnlyPred to OnlySucc.
  102. FoldSingleEntryPHINodes(BB);
  103. // Delete the unconditional branch from the predecessor...
  104. OnlyPred->getInstList().pop_back();
  105. // Make all PHI nodes that referred to BB now refer to Pred as their
  106. // source...
  107. BB->replaceAllUsesWith(OnlyPred);
  108. // Move all definitions in the successor to the predecessor...
  109. OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
  110. // OldName will be valid until erased.
  111. StringRef OldName = BB->getName();
  112. // Erase the old block and update dominator info.
  113. if (DT)
  114. if (DomTreeNode *DTN = DT->getNode(BB)) {
  115. DomTreeNode *PredDTN = DT->getNode(OnlyPred);
  116. SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
  117. for (auto *DI : Children)
  118. DT->changeImmediateDominator(DI, PredDTN);
  119. DT->eraseNode(BB);
  120. }
  121. // ScalarEvolution holds references to loop exit blocks.
  122. if (SE) {
  123. if (Loop *L = LI->getLoopFor(BB)) {
  124. if (ForgottenLoops.insert(L).second)
  125. SE->forgetLoop(L);
  126. }
  127. }
  128. LI->removeBlock(BB);
  129. // Inherit predecessor's name if it exists...
  130. if (!OldName.empty() && !OnlyPred->hasName())
  131. OnlyPred->setName(OldName);
  132. BB->eraseFromParent();
  133. return OnlyPred;
  134. }
  135. /// Check if unrolling created a situation where we need to insert phi nodes to
  136. /// preserve LCSSA form.
  137. /// \param Blocks is a vector of basic blocks representing unrolled loop.
  138. /// \param L is the outer loop.
  139. /// It's possible that some of the blocks are in L, and some are not. In this
  140. /// case, if there is a use is outside L, and definition is inside L, we need to
  141. /// insert a phi-node, otherwise LCSSA will be broken.
  142. /// The function is just a helper function for llvm::UnrollLoop that returns
  143. /// true if this situation occurs, indicating that LCSSA needs to be fixed.
  144. static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
  145. LoopInfo *LI) {
  146. for (BasicBlock *BB : Blocks) {
  147. if (LI->getLoopFor(BB) == L)
  148. continue;
  149. for (Instruction &I : *BB) {
  150. for (Use &U : I.operands()) {
  151. if (auto Def = dyn_cast<Instruction>(U)) {
  152. Loop *DefLoop = LI->getLoopFor(Def->getParent());
  153. if (!DefLoop)
  154. continue;
  155. if (DefLoop->contains(L))
  156. return true;
  157. }
  158. }
  159. }
  160. }
  161. return false;
  162. }
  163. /// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
  164. /// and adds a mapping from the original loop to the new loop to NewLoops.
  165. /// Returns nullptr if no new loop was created and a pointer to the
  166. /// original loop OriginalBB was part of otherwise.
  167. const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
  168. BasicBlock *ClonedBB, LoopInfo *LI,
  169. NewLoopsMap &NewLoops) {
  170. // Figure out which loop New is in.
  171. const Loop *OldLoop = LI->getLoopFor(OriginalBB);
  172. assert(OldLoop && "Should (at least) be in the loop being unrolled!");
  173. Loop *&NewLoop = NewLoops[OldLoop];
  174. if (!NewLoop) {
  175. // Found a new sub-loop.
  176. assert(OriginalBB == OldLoop->getHeader() &&
  177. "Header should be first in RPO");
  178. NewLoop = new Loop();
  179. Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
  180. if (NewLoopParent)
  181. NewLoopParent->addChildLoop(NewLoop);
  182. else
  183. LI->addTopLevelLoop(NewLoop);
  184. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  185. return OldLoop;
  186. } else {
  187. NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
  188. return nullptr;
  189. }
  190. }
  191. /// The function chooses which type of unroll (epilog or prolog) is more
  192. /// profitabale.
  193. /// Epilog unroll is more profitable when there is PHI that starts from
  194. /// constant. In this case epilog will leave PHI start from constant,
  195. /// but prolog will convert it to non-constant.
  196. ///
  197. /// loop:
  198. /// PN = PHI [I, Latch], [CI, PreHeader]
  199. /// I = foo(PN)
  200. /// ...
  201. ///
  202. /// Epilog unroll case.
  203. /// loop:
  204. /// PN = PHI [I2, Latch], [CI, PreHeader]
  205. /// I1 = foo(PN)
  206. /// I2 = foo(I1)
  207. /// ...
  208. /// Prolog unroll case.
  209. /// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
  210. /// loop:
  211. /// PN = PHI [I2, Latch], [NewPN, PreHeader]
  212. /// I1 = foo(PN)
  213. /// I2 = foo(I1)
  214. /// ...
  215. ///
  216. static bool isEpilogProfitable(Loop *L) {
  217. BasicBlock *PreHeader = L->getLoopPreheader();
  218. BasicBlock *Header = L->getHeader();
  219. assert(PreHeader && Header);
  220. for (Instruction &BBI : *Header) {
  221. PHINode *PN = dyn_cast<PHINode>(&BBI);
  222. if (!PN)
  223. break;
  224. if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
  225. return true;
  226. }
  227. return false;
  228. }
  229. /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
  230. /// if unrolling was successful, or false if the loop was unmodified. Unrolling
  231. /// can only fail when the loop's latch block is not terminated by a conditional
  232. /// branch instruction. However, if the trip count (and multiple) are not known,
  233. /// loop unrolling will mostly produce more code that is no faster.
  234. ///
  235. /// TripCount is the upper bound of the iteration on which control exits
  236. /// LatchBlock. Control may exit the loop prior to TripCount iterations either
  237. /// via an early branch in other loop block or via LatchBlock terminator. This
  238. /// is relaxed from the general definition of trip count which is the number of
  239. /// times the loop header executes. Note that UnrollLoop assumes that the loop
  240. /// counter test is in LatchBlock in order to remove unnecesssary instances of
  241. /// the test. If control can exit the loop from the LatchBlock's terminator
  242. /// prior to TripCount iterations, flag PreserveCondBr needs to be set.
  243. ///
  244. /// PreserveCondBr indicates whether the conditional branch of the LatchBlock
  245. /// needs to be preserved. It is needed when we use trip count upper bound to
  246. /// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
  247. /// conditional branch needs to be preserved.
  248. ///
  249. /// Similarly, TripMultiple divides the number of times that the LatchBlock may
  250. /// execute without exiting the loop.
  251. ///
  252. /// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
  253. /// have a runtime (i.e. not compile time constant) trip count. Unrolling these
  254. /// loops require a unroll "prologue" that runs "RuntimeTripCount % Count"
  255. /// iterations before branching into the unrolled loop. UnrollLoop will not
  256. /// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
  257. /// AllowExpensiveTripCount is false.
  258. ///
  259. /// If we want to perform PGO-based loop peeling, PeelCount is set to the
  260. /// number of iterations we want to peel off.
  261. ///
  262. /// The LoopInfo Analysis that is passed will be kept consistent.
  263. ///
  264. /// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
  265. /// DominatorTree if they are non-null.
  266. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
  267. bool AllowRuntime, bool AllowExpensiveTripCount,
  268. bool PreserveCondBr, bool PreserveOnlyFirst,
  269. unsigned TripMultiple, unsigned PeelCount,
  270. bool UnrollRemainder, LoopInfo *LI,
  271. ScalarEvolution *SE, DominatorTree *DT,
  272. AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
  273. bool PreserveLCSSA) {
  274. BasicBlock *Preheader = L->getLoopPreheader();
  275. if (!Preheader) {
  276. DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
  277. return false;
  278. }
  279. BasicBlock *LatchBlock = L->getLoopLatch();
  280. if (!LatchBlock) {
  281. DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
  282. return false;
  283. }
  284. // Loops with indirectbr cannot be cloned.
  285. if (!L->isSafeToClone()) {
  286. DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
  287. return false;
  288. }
  289. // The current loop unroll pass can only unroll loops with a single latch
  290. // that's a conditional branch exiting the loop.
  291. // FIXME: The implementation can be extended to work with more complicated
  292. // cases, e.g. loops with multiple latches.
  293. BasicBlock *Header = L->getHeader();
  294. BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
  295. if (!BI || BI->isUnconditional()) {
  296. // The loop-rotate pass can be helpful to avoid this in many cases.
  297. DEBUG(dbgs() <<
  298. " Can't unroll; loop not terminated by a conditional branch.\n");
  299. return false;
  300. }
  301. auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
  302. return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2));
  303. };
  304. if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
  305. DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
  306. " exiting the loop can be unrolled\n");
  307. return false;
  308. }
  309. if (Header->hasAddressTaken()) {
  310. // The loop-rotate pass can be helpful to avoid this in many cases.
  311. DEBUG(dbgs() <<
  312. " Won't unroll loop: address of header block is taken.\n");
  313. return false;
  314. }
  315. if (TripCount != 0)
  316. DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
  317. if (TripMultiple != 1)
  318. DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");
  319. // Effectively "DCE" unrolled iterations that are beyond the tripcount
  320. // and will never be executed.
  321. if (TripCount != 0 && Count > TripCount)
  322. Count = TripCount;
  323. // Don't enter the unroll code if there is nothing to do.
  324. if (TripCount == 0 && Count < 2 && PeelCount == 0) {
  325. DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
  326. return false;
  327. }
  328. assert(Count > 0);
  329. assert(TripMultiple > 0);
  330. assert(TripCount == 0 || TripCount % TripMultiple == 0);
  331. // Are we eliminating the loop control altogether?
  332. bool CompletelyUnroll = Count == TripCount;
  333. SmallVector<BasicBlock *, 4> ExitBlocks;
  334. L->getExitBlocks(ExitBlocks);
  335. std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();
  336. // Go through all exits of L and see if there are any phi-nodes there. We just
  337. // conservatively assume that they're inserted to preserve LCSSA form, which
  338. // means that complete unrolling might break this form. We need to either fix
  339. // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
  340. // now we just recompute LCSSA for the outer loop, but it should be possible
  341. // to fix it in-place.
  342. bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
  343. any_of(ExitBlocks, [](const BasicBlock *BB) {
  344. return isa<PHINode>(BB->begin());
  345. });
  346. // We assume a run-time trip count if the compiler cannot
  347. // figure out the loop trip count and the unroll-runtime
  348. // flag is specified.
  349. bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);
  350. assert((!RuntimeTripCount || !PeelCount) &&
  351. "Did not expect runtime trip-count unrolling "
  352. "and peeling for the same loop");
  353. if (PeelCount) {
  354. bool Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);
  355. // Successful peeling may result in a change in the loop preheader/trip
  356. // counts. If we later unroll the loop, we want these to be updated.
  357. if (Peeled) {
  358. BasicBlock *ExitingBlock = L->getExitingBlock();
  359. assert(ExitingBlock && "Loop without exiting block?");
  360. Preheader = L->getLoopPreheader();
  361. TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
  362. TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
  363. }
  364. }
  365. // Loops containing convergent instructions must have a count that divides
  366. // their TripMultiple.
  367. DEBUG(
  368. {
  369. bool HasConvergent = false;
  370. for (auto &BB : L->blocks())
  371. for (auto &I : *BB)
  372. if (auto CS = CallSite(&I))
  373. HasConvergent |= CS.isConvergent();
  374. assert((!HasConvergent || TripMultiple % Count == 0) &&
  375. "Unroll count must divide trip multiple if loop contains a "
  376. "convergent operation.");
  377. });
  378. bool EpilogProfitability =
  379. UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
  380. : isEpilogProfitable(L);
  381. if (RuntimeTripCount && TripMultiple % Count != 0 &&
  382. !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
  383. EpilogProfitability, UnrollRemainder,
  384. LI, SE, DT, AC, ORE,
  385. PreserveLCSSA)) {
  386. if (Force)
  387. RuntimeTripCount = false;
  388. else {
  389. DEBUG(
  390. dbgs() << "Wont unroll; remainder loop could not be generated"
  391. "when assuming runtime trip count\n");
  392. return false;
  393. }
  394. }
  395. // Notify ScalarEvolution that the loop will be substantially changed,
  396. // if not outright eliminated.
  397. if (SE)
  398. SE->forgetLoop(L);
  399. // If we know the trip count, we know the multiple...
  400. unsigned BreakoutTrip = 0;
  401. if (TripCount != 0) {
  402. BreakoutTrip = TripCount % Count;
  403. TripMultiple = 0;
  404. } else {
  405. // Figure out what multiple to use.
  406. BreakoutTrip = TripMultiple =
  407. (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
  408. }
  409. using namespace ore;
  410. // Report the unrolling decision.
  411. if (CompletelyUnroll) {
  412. DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
  413. << " with trip count " << TripCount << "!\n");
  414. ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
  415. L->getHeader())
  416. << "completely unrolled loop with "
  417. << NV("UnrollCount", TripCount) << " iterations");
  418. } else if (PeelCount) {
  419. DEBUG(dbgs() << "PEELING loop %" << Header->getName()
  420. << " with iteration count " << PeelCount << "!\n");
  421. ORE->emit(OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
  422. L->getHeader())
  423. << " peeled loop by " << NV("PeelCount", PeelCount)
  424. << " iterations");
  425. } else {
  426. auto DiagBuilder = [&]() {
  427. OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
  428. L->getHeader());
  429. return Diag << "unrolled loop by a factor of "
  430. << NV("UnrollCount", Count);
  431. };
  432. DEBUG(dbgs() << "UNROLLING loop %" << Header->getName()
  433. << " by " << Count);
  434. if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
  435. DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
  436. ORE->emit([&]() {
  437. return DiagBuilder() << " with a breakout at trip "
  438. << NV("BreakoutTrip", BreakoutTrip);
  439. });
  440. } else if (TripMultiple != 1) {
  441. DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
  442. ORE->emit([&]() {
  443. return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
  444. << " trips per branch";
  445. });
  446. } else if (RuntimeTripCount) {
  447. DEBUG(dbgs() << " with run-time trip count");
  448. ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
  449. }
  450. DEBUG(dbgs() << "!\n");
  451. }
  452. bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  453. BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
  454. // For the first iteration of the loop, we should use the precloned values for
  455. // PHI nodes. Insert associations now.
  456. ValueToValueMapTy LastValueMap;
  457. std::vector<PHINode*> OrigPHINode;
  458. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  459. OrigPHINode.push_back(cast<PHINode>(I));
  460. }
  461. std::vector<BasicBlock*> Headers;
  462. std::vector<BasicBlock*> Latches;
  463. Headers.push_back(Header);
  464. Latches.push_back(LatchBlock);
  465. // The current on-the-fly SSA update requires blocks to be processed in
  466. // reverse postorder so that LastValueMap contains the correct value at each
  467. // exit.
  468. LoopBlocksDFS DFS(L);
  469. DFS.perform(LI);
  470. // Stash the DFS iterators before adding blocks to the loop.
  471. LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  472. LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
  473. std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
  474. // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
  475. // might break loop-simplified form for these loops (as they, e.g., would
  476. // share the same exit blocks). We'll keep track of loops for which we can
  477. // break this so that later we can re-simplify them.
  478. SmallSetVector<Loop *, 4> LoopsToSimplify;
  479. for (Loop *SubLoop : *L)
  480. LoopsToSimplify.insert(SubLoop);
  481. if (Header->getParent()->isDebugInfoForProfiling())
  482. for (BasicBlock *BB : L->getBlocks())
  483. for (Instruction &I : *BB)
  484. if (const DILocation *DIL = I.getDebugLoc())
  485. I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
  486. for (unsigned It = 1; It != Count; ++It) {
  487. std::vector<BasicBlock*> NewBlocks;
  488. SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
  489. NewLoops[L] = L;
  490. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  491. ValueToValueMapTy VMap;
  492. BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
  493. Header->getParent()->getBasicBlockList().push_back(New);
  494. assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
  495. "Header should not be in a sub-loop");
  496. // Tell LI about New.
  497. const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
  498. if (OldLoop) {
  499. LoopsToSimplify.insert(NewLoops[OldLoop]);
  500. // Forget the old loop, since its inputs may have changed.
  501. if (SE)
  502. SE->forgetLoop(OldLoop);
  503. }
  504. if (*BB == Header)
  505. // Loop over all of the PHI nodes in the block, changing them to use
  506. // the incoming values from the previous block.
  507. for (PHINode *OrigPHI : OrigPHINode) {
  508. PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
  509. Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
  510. if (Instruction *InValI = dyn_cast<Instruction>(InVal))
  511. if (It > 1 && L->contains(InValI))
  512. InVal = LastValueMap[InValI];
  513. VMap[OrigPHI] = InVal;
  514. New->getInstList().erase(NewPHI);
  515. }
  516. // Update our running map of newest clones
  517. LastValueMap[*BB] = New;
  518. for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
  519. VI != VE; ++VI)
  520. LastValueMap[VI->first] = VI->second;
  521. // Add phi entries for newly created values to all exit blocks.
  522. for (BasicBlock *Succ : successors(*BB)) {
  523. if (L->contains(Succ))
  524. continue;
  525. for (BasicBlock::iterator BBI = Succ->begin();
  526. PHINode *phi = dyn_cast<PHINode>(BBI); ++BBI) {
  527. Value *Incoming = phi->getIncomingValueForBlock(*BB);
  528. ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
  529. if (It != LastValueMap.end())
  530. Incoming = It->second;
  531. phi->addIncoming(Incoming, New);
  532. }
  533. }
  534. // Keep track of new headers and latches as we create them, so that
  535. // we can insert the proper branches later.
  536. if (*BB == Header)
  537. Headers.push_back(New);
  538. if (*BB == LatchBlock)
  539. Latches.push_back(New);
  540. NewBlocks.push_back(New);
  541. UnrolledLoopBlocks.push_back(New);
  542. // Update DomTree: since we just copy the loop body, and each copy has a
  543. // dedicated entry block (copy of the header block), this header's copy
  544. // dominates all copied blocks. That means, dominance relations in the
  545. // copied body are the same as in the original body.
  546. if (DT) {
  547. if (*BB == Header)
  548. DT->addNewBlock(New, Latches[It - 1]);
  549. else {
  550. auto BBDomNode = DT->getNode(*BB);
  551. auto BBIDom = BBDomNode->getIDom();
  552. BasicBlock *OriginalBBIDom = BBIDom->getBlock();
  553. DT->addNewBlock(
  554. New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
  555. }
  556. }
  557. }
  558. // Remap all instructions in the most recent iteration
  559. for (BasicBlock *NewBlock : NewBlocks) {
  560. for (Instruction &I : *NewBlock) {
  561. ::remapInstruction(&I, LastValueMap);
  562. if (auto *II = dyn_cast<IntrinsicInst>(&I))
  563. if (II->getIntrinsicID() == Intrinsic::assume)
  564. AC->registerAssumption(II);
  565. }
  566. }
  567. }
  568. // Loop over the PHI nodes in the original block, setting incoming values.
  569. for (PHINode *PN : OrigPHINode) {
  570. if (CompletelyUnroll) {
  571. PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
  572. Header->getInstList().erase(PN);
  573. }
  574. else if (Count > 1) {
  575. Value *InVal = PN->removeIncomingValue(LatchBlock, false);
  576. // If this value was defined in the loop, take the value defined by the
  577. // last iteration of the loop.
  578. if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
  579. if (L->contains(InValI))
  580. InVal = LastValueMap[InVal];
  581. }
  582. assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
  583. PN->addIncoming(InVal, Latches.back());
  584. }
  585. }
  586. // Now that all the basic blocks for the unrolled iterations are in place,
  587. // set up the branches to connect them.
  588. for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
  589. // The original branch was replicated in each unrolled iteration.
  590. BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
  591. // The branch destination.
  592. unsigned j = (i + 1) % e;
  593. BasicBlock *Dest = Headers[j];
  594. bool NeedConditional = true;
  595. if (RuntimeTripCount && j != 0) {
  596. NeedConditional = false;
  597. }
  598. // For a complete unroll, make the last iteration end with a branch
  599. // to the exit block.
  600. if (CompletelyUnroll) {
  601. if (j == 0)
  602. Dest = LoopExit;
  603. // If using trip count upper bound to completely unroll, we need to keep
  604. // the conditional branch except the last one because the loop may exit
  605. // after any iteration.
  606. assert(NeedConditional &&
  607. "NeedCondition cannot be modified by both complete "
  608. "unrolling and runtime unrolling");
  609. NeedConditional = (PreserveCondBr && j && !(PreserveOnlyFirst && i != 0));
  610. } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
  611. // If we know the trip count or a multiple of it, we can safely use an
  612. // unconditional branch for some iterations.
  613. NeedConditional = false;
  614. }
  615. if (NeedConditional) {
  616. // Update the conditional branch's successor for the following
  617. // iteration.
  618. Term->setSuccessor(!ContinueOnTrue, Dest);
  619. } else {
  620. // Remove phi operands at this loop exit
  621. if (Dest != LoopExit) {
  622. BasicBlock *BB = Latches[i];
  623. for (BasicBlock *Succ: successors(BB)) {
  624. if (Succ == Headers[i])
  625. continue;
  626. for (BasicBlock::iterator BBI = Succ->begin();
  627. PHINode *Phi = dyn_cast<PHINode>(BBI); ++BBI) {
  628. Phi->removeIncomingValue(BB, false);
  629. }
  630. }
  631. }
  632. // Replace the conditional branch with an unconditional one.
  633. BranchInst::Create(Dest, Term);
  634. Term->eraseFromParent();
  635. }
  636. }
  637. // Update dominators of blocks we might reach through exits.
  638. // Immediate dominator of such block might change, because we add more
  639. // routes which can lead to the exit: we can now reach it from the copied
  640. // iterations too.
  641. if (DT && Count > 1) {
  642. for (auto *BB : OriginalLoopBlocks) {
  643. auto *BBDomNode = DT->getNode(BB);
  644. SmallVector<BasicBlock *, 16> ChildrenToUpdate;
  645. for (auto *ChildDomNode : BBDomNode->getChildren()) {
  646. auto *ChildBB = ChildDomNode->getBlock();
  647. if (!L->contains(ChildBB))
  648. ChildrenToUpdate.push_back(ChildBB);
  649. }
  650. BasicBlock *NewIDom;
  651. if (BB == LatchBlock) {
  652. // The latch is special because we emit unconditional branches in
  653. // some cases where the original loop contained a conditional branch.
  654. // Since the latch is always at the bottom of the loop, if the latch
  655. // dominated an exit before unrolling, the new dominator of that exit
  656. // must also be a latch. Specifically, the dominator is the first
  657. // latch which ends in a conditional branch, or the last latch if
  658. // there is no such latch.
  659. NewIDom = Latches.back();
  660. for (BasicBlock *IterLatch : Latches) {
  661. TerminatorInst *Term = IterLatch->getTerminator();
  662. if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
  663. NewIDom = IterLatch;
  664. break;
  665. }
  666. }
  667. } else {
  668. // The new idom of the block will be the nearest common dominator
  669. // of all copies of the previous idom. This is equivalent to the
  670. // nearest common dominator of the previous idom and the first latch,
  671. // which dominates all copies of the previous idom.
  672. NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
  673. }
  674. for (auto *ChildBB : ChildrenToUpdate)
  675. DT->changeImmediateDominator(ChildBB, NewIDom);
  676. }
  677. }
  678. if (DT && UnrollVerifyDomtree)
  679. DT->verifyDomTree();
  680. // Merge adjacent basic blocks, if possible.
  681. SmallPtrSet<Loop *, 4> ForgottenLoops;
  682. for (BasicBlock *Latch : Latches) {
  683. BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
  684. if (Term->isUnconditional()) {
  685. BasicBlock *Dest = Term->getSuccessor(0);
  686. if (BasicBlock *Fold =
  687. foldBlockIntoPredecessor(Dest, LI, SE, ForgottenLoops, DT)) {
  688. // Dest has been folded into Fold. Update our worklists accordingly.
  689. std::replace(Latches.begin(), Latches.end(), Dest, Fold);
  690. UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
  691. UnrolledLoopBlocks.end(), Dest),
  692. UnrolledLoopBlocks.end());
  693. }
  694. }
  695. }
  696. // Simplify any new induction variables in the partially unrolled loop.
  697. if (SE && !CompletelyUnroll && Count > 1) {
  698. SmallVector<WeakTrackingVH, 16> DeadInsts;
  699. simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
  700. // Aggressively clean up dead instructions that simplifyLoopIVs already
  701. // identified. Any remaining should be cleaned up below.
  702. while (!DeadInsts.empty())
  703. if (Instruction *Inst =
  704. dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
  705. RecursivelyDeleteTriviallyDeadInstructions(Inst);
  706. }
  707. // At this point, the code is well formed. We now do a quick sweep over the
  708. // inserted code, doing constant propagation and dead code elimination as we
  709. // go.
  710. const DataLayout &DL = Header->getModule()->getDataLayout();
  711. const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
  712. for (BasicBlock *BB : NewLoopBlocks) {
  713. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
  714. Instruction *Inst = &*I++;
  715. if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
  716. if (LI->replacementPreservesLCSSAForm(Inst, V))
  717. Inst->replaceAllUsesWith(V);
  718. if (isInstructionTriviallyDead(Inst))
  719. BB->getInstList().erase(Inst);
  720. }
  721. }
  722. // TODO: after peeling or unrolling, previously loop variant conditions are
  723. // likely to fold to constants, eagerly propagating those here will require
  724. // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
  725. // appropriate.
  726. NumCompletelyUnrolled += CompletelyUnroll;
  727. ++NumUnrolled;
  728. Loop *OuterL = L->getParentLoop();
  729. // Update LoopInfo if the loop is completely removed.
  730. if (CompletelyUnroll)
  731. LI->markAsRemoved(L);
  732. // After complete unrolling most of the blocks should be contained in OuterL.
  733. // However, some of them might happen to be out of OuterL (e.g. if they
  734. // precede a loop exit). In this case we might need to insert PHI nodes in
  735. // order to preserve LCSSA form.
  736. // We don't need to check this if we already know that we need to fix LCSSA
  737. // form.
  738. // TODO: For now we just recompute LCSSA for the outer loop in this case, but
  739. // it should be possible to fix it in-place.
  740. if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
  741. NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
  742. // If we have a pass and a DominatorTree we should re-simplify impacted loops
  743. // to ensure subsequent analyses can rely on this form. We want to simplify
  744. // at least one layer outside of the loop that was unrolled so that any
  745. // changes to the parent loop exposed by the unrolling are considered.
  746. if (DT) {
  747. if (OuterL) {
  748. // OuterL includes all loops for which we can break loop-simplify, so
  749. // it's sufficient to simplify only it (it'll recursively simplify inner
  750. // loops too).
  751. if (NeedToFixLCSSA) {
  752. // LCSSA must be performed on the outermost affected loop. The unrolled
  753. // loop's last loop latch is guaranteed to be in the outermost loop
  754. // after LoopInfo's been updated by markAsRemoved.
  755. Loop *LatchLoop = LI->getLoopFor(Latches.back());
  756. Loop *FixLCSSALoop = OuterL;
  757. if (!FixLCSSALoop->contains(LatchLoop))
  758. while (FixLCSSALoop->getParentLoop() != LatchLoop)
  759. FixLCSSALoop = FixLCSSALoop->getParentLoop();
  760. formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
  761. } else if (PreserveLCSSA) {
  762. assert(OuterL->isLCSSAForm(*DT) &&
  763. "Loops should be in LCSSA form after loop-unroll.");
  764. }
  765. // TODO: That potentially might be compile-time expensive. We should try
  766. // to fix the loop-simplified form incrementally.
  767. simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
  768. } else {
  769. // Simplify loops for which we might've broken loop-simplify form.
  770. for (Loop *SubLoop : LoopsToSimplify)
  771. simplifyLoop(SubLoop, DT, LI, SE, AC, PreserveLCSSA);
  772. }
  773. }
  774. return true;
  775. }
  776. /// Given an llvm.loop loop id metadata node, returns the loop hint metadata
  777. /// node with the given name (for example, "llvm.loop.unroll.count"). If no
  778. /// such metadata node exists, then nullptr is returned.
  779. MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
  780. // First operand should refer to the loop id itself.
  781. assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  782. assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
  783. for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
  784. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  785. if (!MD)
  786. continue;
  787. MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  788. if (!S)
  789. continue;
  790. if (Name.equals(S->getString()))
  791. return MD;
  792. }
  793. return nullptr;
  794. }