
//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements some loop unrolling utilities. It does not define any
// actual pass or policy, but provides a single function to perform loop
// unrolling.
//
// The process of unrolling can produce extraneous basic blocks linked with
// unconditional branches. This will be corrected in the future.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

#define DEBUG_TYPE "loop-unroll"

// TODO: Should these be here or in LoopUnroll?
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");

static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
                    cl::desc("Allow runtime unrolled loops to be unrolled "
                             "with epilog instead of prolog."));

static cl::opt<bool>
UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
                    cl::desc("Verify domtree after unrolling"),
#ifdef NDEBUG
    cl::init(false)
#else
    cl::init(true)
#endif
    );
/// Convert the instruction operands from referencing the current values into
/// those specified by VMap.
void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
    Value *Op = I->getOperand(op);

    // Unwrap arguments of dbg.value intrinsics.
    bool Wrapped = false;
    if (auto *V = dyn_cast<MetadataAsValue>(Op))
      if (auto *Unwrapped = dyn_cast<ValueAsMetadata>(V->getMetadata())) {
        Op = Unwrapped->getValue();
        Wrapped = true;
      }

    auto wrap = [&](Value *V) {
      auto &C = I->getContext();
      return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V;
    };

    ValueToValueMapTy::iterator It = VMap.find(Op);
    if (It != VMap.end())
      I->setOperand(op, wrap(It->second));
  }

  if (PHINode *PN = dyn_cast<PHINode>(I)) {
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
      ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
      if (It != VMap.end())
        PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
    }
  }
}
/// Folds a basic block into its predecessor if it only has one predecessor,
/// and that predecessor only has one successor.
/// The LoopInfo Analysis that is passed will be kept consistent.
BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
                                           ScalarEvolution *SE,
                                           DominatorTree *DT) {
  // Merge basic blocks into their predecessor if there is only one distinct
  // pred, and if there is only one distinct successor of the predecessor, and
  // if there are no PHI nodes.
  BasicBlock *OnlyPred = BB->getSinglePredecessor();
  if (!OnlyPred) return nullptr;

  if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
    return nullptr;

  LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
                    << OnlyPred->getName() << "\n");

  // Resolve any PHI nodes at the start of the block. They are all
  // guaranteed to have exactly one entry if they exist, unless there are
  // multiple duplicate (but guaranteed to be equal) entries for the
  // incoming edges. This occurs when there are multiple edges from
  // OnlyPred to OnlySucc.
  FoldSingleEntryPHINodes(BB);

  // Delete the unconditional branch from the predecessor...
  OnlyPred->getInstList().pop_back();

  // Make all PHI nodes that referred to BB now refer to Pred as their
  // source...
  BB->replaceAllUsesWith(OnlyPred);

  // Move all definitions in the successor to the predecessor...
  OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());

  // OldName will be valid until erased.
  StringRef OldName = BB->getName();

  // Erase the old block and update dominator info.
  if (DT)
    if (DomTreeNode *DTN = DT->getNode(BB)) {
      DomTreeNode *PredDTN = DT->getNode(OnlyPred);
      SmallVector<DomTreeNode *, 8> Children(DTN->begin(), DTN->end());
      for (auto *DI : Children)
        DT->changeImmediateDominator(DI, PredDTN);

      DT->eraseNode(BB);
    }

  LI->removeBlock(BB);

  // Inherit predecessor's name if it exists...
  if (!OldName.empty() && !OnlyPred->hasName())
    OnlyPred->setName(OldName);

  BB->eraseFromParent();

  return OnlyPred;
}
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
/// \param Blocks is a vector of basic blocks representing the unrolled loop.
/// \param L is the outer loop.
/// It's possible that some of the blocks are in L, and some are not. In this
/// case, if there is a use outside L whose definition is inside L, we need to
/// insert a phi-node, otherwise LCSSA will be broken.
/// The function is just a helper function for llvm::UnrollLoop that returns
/// true if this situation occurs, indicating that LCSSA needs to be fixed.
static bool needToInsertPhisForLCSSA(Loop *L, std::vector<BasicBlock *> Blocks,
                                     LoopInfo *LI) {
  for (BasicBlock *BB : Blocks) {
    if (LI->getLoopFor(BB) == L)
      continue;
    for (Instruction &I : *BB) {
      for (Use &U : I.operands()) {
        if (auto Def = dyn_cast<Instruction>(U)) {
          Loop *DefLoop = LI->getLoopFor(Def->getParent());
          if (!DefLoop)
            continue;
          if (DefLoop->contains(L))
            return true;
        }
      }
    }
  }
  return false;
}
/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
/// and adds a mapping from the original loop to the new loop to NewLoops.
/// Returns nullptr if no new loop was created; otherwise, returns a pointer to
/// the original loop that OriginalBB was part of.
const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
                                           BasicBlock *ClonedBB, LoopInfo *LI,
                                           NewLoopsMap &NewLoops) {
  // Figure out which loop New is in.
  const Loop *OldLoop = LI->getLoopFor(OriginalBB);
  assert(OldLoop && "Should (at least) be in the loop being unrolled!");

  Loop *&NewLoop = NewLoops[OldLoop];
  if (!NewLoop) {
    // Found a new sub-loop.
    assert(OriginalBB == OldLoop->getHeader() &&
           "Header should be first in RPO");

    NewLoop = LI->AllocateLoop();
    Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());

    if (NewLoopParent)
      NewLoopParent->addChildLoop(NewLoop);
    else
      LI->addTopLevelLoop(NewLoop);

    NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
    return OldLoop;
  } else {
    NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
    return nullptr;
  }
}
/// The function chooses which type of unroll (epilog or prolog) is more
/// profitable.
/// Epilog unroll is more profitable when there is a PHI that starts from a
/// constant. In this case the epilog will leave the PHI starting from a
/// constant, but the prolog will convert it to a non-constant.
///
/// loop:
///   PN = PHI [I, Latch], [CI, PreHeader]
///   I = foo(PN)
///   ...
///
/// Epilog unroll case.
/// loop:
///   PN = PHI [I2, Latch], [CI, PreHeader]
///   I1 = foo(PN)
///   I2 = foo(I1)
///   ...
/// Prolog unroll case.
///   NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
/// loop:
///   PN = PHI [I2, Latch], [NewPN, PreHeader]
///   I1 = foo(PN)
///   I2 = foo(I1)
///   ...
///
static bool isEpilogProfitable(Loop *L) {
  BasicBlock *PreHeader = L->getLoopPreheader();
  BasicBlock *Header = L->getHeader();
  assert(PreHeader && Header);
  for (const PHINode &PN : Header->phis()) {
    if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
      return true;
  }
  return false;
}
/// Perform some cleanup and simplifications on loops after unrolling. It is
/// useful to simplify the IVs in the new loop, as well as do a quick
/// simplify/dce pass of the instructions.
void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
                                   ScalarEvolution *SE, DominatorTree *DT,
                                   AssumptionCache *AC) {
  // Simplify any new induction variables in the partially unrolled loop.
  if (SE && SimplifyIVs) {
    SmallVector<WeakTrackingVH, 16> DeadInsts;
    simplifyLoopIVs(L, SE, DT, LI, DeadInsts);

    // Aggressively clean up dead instructions that simplifyLoopIVs already
    // identified. Any remaining should be cleaned up below.
    while (!DeadInsts.empty())
      if (Instruction *Inst =
              dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
        RecursivelyDeleteTriviallyDeadInstructions(Inst);
  }

  // At this point, the code is well formed. We now do a quick sweep over the
  // inserted code, doing constant propagation and dead code elimination as we
  // go.
  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
  for (BasicBlock *BB : L->getBlocks()) {
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
      Instruction *Inst = &*I++;

      if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
        if (LI->replacementPreservesLCSSAForm(Inst, V))
          Inst->replaceAllUsesWith(V);
      if (isInstructionTriviallyDead(Inst))
        BB->getInstList().erase(Inst);
    }
  }

  // TODO: after peeling or unrolling, previously loop variant conditions are
  // likely to fold to constants, eagerly propagating those here will require
  // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
  // appropriate.
}
/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
/// can only fail when the loop's latch block is not terminated by a conditional
/// branch instruction. However, if the trip count (and multiple) are not known,
/// loop unrolling will mostly produce more code that is no faster.
///
/// TripCount is the upper bound of the iteration on which control exits
/// LatchBlock. Control may exit the loop prior to TripCount iterations either
/// via an early branch in another loop block or via the LatchBlock terminator.
/// This is relaxed from the general definition of trip count, which is the
/// number of times the loop header executes. Note that UnrollLoop assumes that
/// the loop counter test is in LatchBlock in order to remove unnecessary
/// instances of the test. If control can exit the loop from the LatchBlock's
/// terminator prior to TripCount iterations, flag PreserveCondBr needs to be
/// set.
///
/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
/// needs to be preserved. It is needed when we use the trip count upper bound
/// to fully unroll the loop. If PreserveOnlyFirst is also set then only the
/// first conditional branch needs to be preserved.
///
/// Similarly, TripMultiple divides the number of times that the LatchBlock may
/// execute without exiting the loop.
///
/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
/// iterations before branching into the unrolled loop. UnrollLoop will not
/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
/// AllowExpensiveTripCount is false.
///
/// If we want to perform PGO-based loop peeling, PeelCount is set to the
/// number of iterations we want to peel off.
///
/// The LoopInfo Analysis that is passed will be kept consistent.
///
/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
/// DominatorTree if they are non-null.
///
/// If RemainderLoop is non-null, it will receive the remainder loop (if
/// required and not fully unrolled).
LoopUnrollResult llvm::UnrollLoop(
    Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime,
    bool AllowExpensiveTripCount, bool PreserveCondBr, bool PreserveOnlyFirst,
    unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder,
    LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, bool PreserveLCSSA, Loop **RemainderLoop) {

  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) {
    LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
    return LoopUnrollResult::Unmodified;
  }

  BasicBlock *LatchBlock = L->getLoopLatch();
  if (!LatchBlock) {
    LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
    return LoopUnrollResult::Unmodified;
  }

  // Loops with indirectbr cannot be cloned.
  if (!L->isSafeToClone()) {
    LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
    return LoopUnrollResult::Unmodified;
  }

  // The current loop unroll pass can only unroll loops with a single latch
  // that's a conditional branch exiting the loop.
  // FIXME: The implementation can be extended to work with more complicated
  // cases, e.g. loops with multiple latches.
  BasicBlock *Header = L->getHeader();
  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());

  if (!BI || BI->isUnconditional()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    LLVM_DEBUG(
        dbgs()
        << " Can't unroll; loop not terminated by a conditional branch.\n");
    return LoopUnrollResult::Unmodified;
  }
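
  // The latch's conditional branch must jump back to the header on one
  // successor and leave the loop on the other, in either order.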
  auto CheckSuccessors = [&](unsigned S1, unsigned S2) {
    return BI->getSuccessor(S1) == Header && !L->contains(BI->getSuccessor(S2));
  };

  if (!CheckSuccessors(0, 1) && !CheckSuccessors(1, 0)) {
    LLVM_DEBUG(dbgs() << "Can't unroll; only loops with one conditional latch"
                         " exiting the loop can be unrolled\n");
    return LoopUnrollResult::Unmodified;
  }

  if (Header->hasAddressTaken()) {
    // The loop-rotate pass can be helpful to avoid this in many cases.
    LLVM_DEBUG(
        dbgs() << " Won't unroll loop: address of header block is taken.\n");
    return LoopUnrollResult::Unmodified;
  }

  if (TripCount != 0)
    LLVM_DEBUG(dbgs() << " Trip Count = " << TripCount << "\n");
  if (TripMultiple != 1)
    LLVM_DEBUG(dbgs() << " Trip Multiple = " << TripMultiple << "\n");

  // Effectively "DCE" unrolled iterations that are beyond the tripcount
  // and will never be executed.
  if (TripCount != 0 && Count > TripCount)
    Count = TripCount;

  // Don't enter the unroll code if there is nothing to do.
  if (TripCount == 0 && Count < 2 && PeelCount == 0) {
    LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
    return LoopUnrollResult::Unmodified;
  }

  assert(Count > 0);
  assert(TripMultiple > 0);
  assert(TripCount == 0 || TripCount % TripMultiple == 0);

  // Are we eliminating the loop control altogether?
  bool CompletelyUnroll = Count == TripCount;
  SmallVector<BasicBlock *, 4> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  std::vector<BasicBlock*> OriginalLoopBlocks = L->getBlocks();

  // Go through all exits of L and see if there are any phi-nodes there. We just
  // conservatively assume that they're inserted to preserve LCSSA form, which
  // means that complete unrolling might break this form. We need to either fix
  // it in-place after the transformation, or entirely rebuild LCSSA. TODO: For
  // now we just recompute LCSSA for the outer loop, but it should be possible
  // to fix it in-place.
  bool NeedToFixLCSSA = PreserveLCSSA && CompletelyUnroll &&
                        any_of(ExitBlocks, [](const BasicBlock *BB) {
                          return isa<PHINode>(BB->begin());
                        });

  // We assume a run-time trip count if the compiler cannot
  // figure out the loop trip count and the unroll-runtime
  // flag is specified.
  bool RuntimeTripCount = (TripCount == 0 && Count > 0 && AllowRuntime);

  assert((!RuntimeTripCount || !PeelCount) &&
         "Did not expect runtime trip-count unrolling "
         "and peeling for the same loop");

  bool Peeled = false;
  if (PeelCount) {
    Peeled = peelLoop(L, PeelCount, LI, SE, DT, AC, PreserveLCSSA);

    // Successful peeling may result in a change in the loop preheader/trip
    // counts. If we later unroll the loop, we want these to be updated.
    if (Peeled) {
      BasicBlock *ExitingBlock = L->getExitingBlock();
      assert(ExitingBlock && "Loop without exiting block?");
      Preheader = L->getLoopPreheader();
      TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
      TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
    }
  }
  // Loops containing convergent instructions must have a count that divides
  // their TripMultiple.
  LLVM_DEBUG(
      {
        bool HasConvergent = false;
        for (auto &BB : L->blocks())
          for (auto &I : *BB)
            if (auto CS = CallSite(&I))
              HasConvergent |= CS.isConvergent();
        assert((!HasConvergent || TripMultiple % Count == 0) &&
               "Unroll count must divide trip multiple if loop contains a "
               "convergent operation.");
      });

  bool EpilogProfitability =
      UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
                                              : isEpilogProfitable(L);

  if (RuntimeTripCount && TripMultiple % Count != 0 &&
      !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
                                  EpilogProfitability, UnrollRemainder, LI, SE,
                                  DT, AC, PreserveLCSSA, RemainderLoop)) {
    if (Force)
      RuntimeTripCount = false;
    else {
      LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
                           "generated when assuming runtime trip count\n");
      return LoopUnrollResult::Unmodified;
    }
  }
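
  // BreakoutTrip identifies the copy of the latch, within each unrolled body,
  // at which the loop may still exit early: TripCount % Count when the exact
  // trip count is known, otherwise the GCD of Count and TripMultiple.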
  // If we know the trip count, we know the multiple...
  unsigned BreakoutTrip = 0;
  if (TripCount != 0) {
    BreakoutTrip = TripCount % Count;
    TripMultiple = 0;
  } else {
    // Figure out what multiple to use.
    BreakoutTrip = TripMultiple =
        (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
  }
  using namespace ore;
  // Report the unrolling decision.
  if (CompletelyUnroll) {
    LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
                      << " with trip count " << TripCount << "!\n");
    if (ORE)
      ORE->emit([&]() {
        return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
                                  L->getHeader())
               << "completely unrolled loop with "
               << NV("UnrollCount", TripCount) << " iterations";
      });
  } else if (PeelCount) {
    LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
                      << " with iteration count " << PeelCount << "!\n");
    if (ORE)
      ORE->emit([&]() {
        return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
                                  L->getHeader())
               << " peeled loop by " << NV("PeelCount", PeelCount)
               << " iterations";
      });
  } else {
    auto DiagBuilder = [&]() {
      OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
                              L->getHeader());
      return Diag << "unrolled loop by a factor of "
                  << NV("UnrollCount", Count);
    };

    LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
                      << Count);
    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
      LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
      if (ORE)
        ORE->emit([&]() {
          return DiagBuilder() << " with a breakout at trip "
                               << NV("BreakoutTrip", BreakoutTrip);
        });
    } else if (TripMultiple != 1) {
      LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
      if (ORE)
        ORE->emit([&]() {
          return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
                               << " trips per branch";
        });
    } else if (RuntimeTripCount) {
      LLVM_DEBUG(dbgs() << " with run-time trip count");
      if (ORE)
        ORE->emit(
            [&]() { return DiagBuilder() << " with run-time trip count"; });
    }
    LLVM_DEBUG(dbgs() << "!\n");
  }
  // We are going to make changes to this loop. SCEV may be keeping cached info
  // about it, in particular about the backedge taken count. The changes we make
  // are guaranteed to invalidate this information for our loop. It is tempting
  // to only invalidate the loop being unrolled, but that is incorrect: exiting
  // branches from inner loops have an impact on the outer loops, and if
  // something changes inside them then any of the outer loops may also change.
  // When we forget the outermost loop, we also forget all contained loops,
  // which is what we need here.
  if (SE)
    SE->forgetTopmostLoop(L);
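
  // Record which successor of the latch branch continues the loop (the
  // backedge) and which one exits; this orientation is reused when the cloned
  // latches are rewired below.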
  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);

  // For the first iteration of the loop, we should use the precloned values
  // for PHI nodes. Insert associations now.
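  // LastValueMap always maps an original value to its clone from the most
  // recently created copy of the loop body; it is what remapInstruction uses
  // to rewrite the operands of each newly cloned iteration.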
  ValueToValueMapTy LastValueMap;
  std::vector<PHINode*> OrigPHINode;
  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
    OrigPHINode.push_back(cast<PHINode>(I));
  }

  std::vector<BasicBlock*> Headers;
  std::vector<BasicBlock*> Latches;
  Headers.push_back(Header);
  Latches.push_back(LatchBlock);

  // The current on-the-fly SSA update requires blocks to be processed in
  // reverse postorder so that LastValueMap contains the correct value at each
  // exit.
  LoopBlocksDFS DFS(L);
  DFS.perform(LI);

  // Stash the DFS iterators before adding blocks to the loop.
  LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
  LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();

  std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();

  // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
  // might break loop-simplified form for these loops (as they, e.g., would
  // share the same exit blocks). We'll keep track of loops for which we can
  // break this so that later we can re-simplify them.
  SmallSetVector<Loop *, 4> LoopsToSimplify;
  for (Loop *SubLoop : *L)
    LoopsToSimplify.insert(SubLoop);

  if (Header->getParent()->isDebugInfoForProfiling())
    for (BasicBlock *BB : L->getBlocks())
      for (Instruction &I : *BB)
        if (!isa<DbgInfoIntrinsic>(&I))
          if (const DILocation *DIL = I.getDebugLoc())
            I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
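
  // Each pass of this loop clones the entire loop body once; the original
  // body serves as iteration 0, so cloning starts at It = 1.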
  for (unsigned It = 1; It != Count; ++It) {
    std::vector<BasicBlock*> NewBlocks;
    SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
    NewLoops[L] = L;

    for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
      ValueToValueMapTy VMap;
      BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
      Header->getParent()->getBasicBlockList().push_back(New);

      assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
             "Header should not be in a sub-loop");
      // Tell LI about New.
      const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
      if (OldLoop)
        LoopsToSimplify.insert(NewLoops[OldLoop]);

      if (*BB == Header)
        // Loop over all of the PHI nodes in the block, changing them to use
        // the incoming values from the previous block.
        for (PHINode *OrigPHI : OrigPHINode) {
          PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
            if (It > 1 && L->contains(InValI))
              InVal = LastValueMap[InValI];
          VMap[OrigPHI] = InVal;
          New->getInstList().erase(NewPHI);
        }

      // Update our running map of newest clones
      LastValueMap[*BB] = New;
      for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
           VI != VE; ++VI)
        LastValueMap[VI->first] = VI->second;

      // Add phi entries for newly created values to all exit blocks.
      for (BasicBlock *Succ : successors(*BB)) {
        if (L->contains(Succ))
          continue;
        for (PHINode &PHI : Succ->phis()) {
          Value *Incoming = PHI.getIncomingValueForBlock(*BB);
          ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
          if (It != LastValueMap.end())
            Incoming = It->second;
          PHI.addIncoming(Incoming, New);
        }
      }

      // Keep track of new headers and latches as we create them, so that
      // we can insert the proper branches later.
      if (*BB == Header)
        Headers.push_back(New);
      if (*BB == LatchBlock)
        Latches.push_back(New);

      NewBlocks.push_back(New);
      UnrolledLoopBlocks.push_back(New);

      // Update DomTree: since we just copy the loop body, and each copy has a
      // dedicated entry block (copy of the header block), this header's copy
      // dominates all copied blocks. That means, dominance relations in the
      // copied body are the same as in the original body.
      if (DT) {
        if (*BB == Header)
          DT->addNewBlock(New, Latches[It - 1]);
        else {
          auto BBDomNode = DT->getNode(*BB);
          auto BBIDom = BBDomNode->getIDom();
          BasicBlock *OriginalBBIDom = BBIDom->getBlock();
          DT->addNewBlock(
              New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
        }
      }
    }

    // Remap all instructions in the most recent iteration
    for (BasicBlock *NewBlock : NewBlocks) {
      for (Instruction &I : *NewBlock) {
        ::remapInstruction(&I, LastValueMap);
        if (auto *II = dyn_cast<IntrinsicInst>(&I))
          if (II->getIntrinsicID() == Intrinsic::assume)
            AC->registerAssumption(II);
      }
    }
  }
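
  // For a complete unroll every header PHI collapses to its preheader value;
  // for a partial unroll the backedge value must now come from the latch of
  // the last copy.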
  // Loop over the PHI nodes in the original block, setting incoming values.
  for (PHINode *PN : OrigPHINode) {
    if (CompletelyUnroll) {
      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
      Header->getInstList().erase(PN);
    } else if (Count > 1) {
      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
      // If this value was defined in the loop, take the value defined by the
      // last iteration of the loop.
      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
        if (L->contains(InValI))
          InVal = LastValueMap[InVal];
      }
      assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
      PN->addIncoming(InVal, Latches.back());
    }
  }

  // Now that all the basic blocks for the unrolled iterations are in place,
  // set up the branches to connect them.
  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
    // The original branch was replicated in each unrolled iteration.
    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());

    // The branch destination.
    unsigned j = (i + 1) % e;
    BasicBlock *Dest = Headers[j];
    bool NeedConditional = true;

    if (RuntimeTripCount && j != 0) {
      NeedConditional = false;
    }

    // For a complete unroll, make the last iteration end with a branch
    // to the exit block.
    if (CompletelyUnroll) {
      if (j == 0)
        Dest = LoopExit;
      // If using trip count upper bound to completely unroll, we need to keep
      // the conditional branch except the last one because the loop may exit
      // after any iteration.
      assert(NeedConditional &&
             "NeedCondition cannot be modified by both complete "
             "unrolling and runtime unrolling");
      NeedConditional = (PreserveCondBr && j && !(PreserveOnlyFirst && i != 0));
    } else if (j != BreakoutTrip &&
               (TripMultiple == 0 || j % TripMultiple != 0)) {
      // If we know the trip count or a multiple of it, we can safely use an
      // unconditional branch for some iterations.
      NeedConditional = false;
    }

    if (NeedConditional) {
      // Update the conditional branch's successor for the following
      // iteration.
      Term->setSuccessor(!ContinueOnTrue, Dest);
    } else {
      // Remove phi operands at this loop exit
      if (Dest != LoopExit) {
        BasicBlock *BB = Latches[i];
        for (BasicBlock *Succ : successors(BB)) {
          if (Succ == Headers[i])
            continue;
          for (PHINode &Phi : Succ->phis())
            Phi.removeIncomingValue(BB, false);
        }
      }
      // Replace the conditional branch with an unconditional one.
      BranchInst::Create(Dest, Term);
      Term->eraseFromParent();
    }
  }
  // Update dominators of blocks we might reach through exits.
  // Immediate dominator of such block might change, because we add more
  // routes which can lead to the exit: we can now reach it from the copied
  // iterations too.
  if (DT && Count > 1) {
    for (auto *BB : OriginalLoopBlocks) {
      auto *BBDomNode = DT->getNode(BB);
      SmallVector<BasicBlock *, 16> ChildrenToUpdate;
      for (auto *ChildDomNode : BBDomNode->getChildren()) {
        auto *ChildBB = ChildDomNode->getBlock();
        if (!L->contains(ChildBB))
          ChildrenToUpdate.push_back(ChildBB);
      }
      BasicBlock *NewIDom;
      if (BB == LatchBlock) {
        // The latch is special because we emit unconditional branches in
        // some cases where the original loop contained a conditional branch.
        // Since the latch is always at the bottom of the loop, if the latch
        // dominated an exit before unrolling, the new dominator of that exit
        // must also be a latch. Specifically, the dominator is the first
        // latch which ends in a conditional branch, or the last latch if
        // there is no such latch.
        NewIDom = Latches.back();
        for (BasicBlock *IterLatch : Latches) {
          Instruction *Term = IterLatch->getTerminator();
          if (isa<BranchInst>(Term) &&
              cast<BranchInst>(Term)->isConditional()) {
            NewIDom = IterLatch;
            break;
          }
        }
      } else {
        // The new idom of the block will be the nearest common dominator
        // of all copies of the previous idom. This is equivalent to the
        // nearest common dominator of the previous idom and the first latch,
        // which dominates all copies of the previous idom.
        NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
      }
      for (auto *ChildBB : ChildrenToUpdate)
        DT->changeImmediateDominator(ChildBB, NewIDom);
    }
  }

  assert(!DT || !UnrollVerifyDomtree ||
         DT->verify(DominatorTree::VerificationLevel::Fast));

  // Merge adjacent basic blocks, if possible.
  for (BasicBlock *Latch : Latches) {
    BranchInst *Term = cast<BranchInst>(Latch->getTerminator());
    if (Term->isUnconditional()) {
      BasicBlock *Dest = Term->getSuccessor(0);
      if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
        // Dest has been folded into Fold. Update our worklists accordingly.
        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
        UnrolledLoopBlocks.erase(std::remove(UnrolledLoopBlocks.begin(),
                                             UnrolledLoopBlocks.end(), Dest),
                                 UnrolledLoopBlocks.end());
      }
    }
  }
  // At this point, the code is well formed. We now simplify the unrolled loop,
  // doing constant propagation and dead code elimination as we go.
  simplifyLoopAfterUnroll(L, !CompletelyUnroll && (Count > 1 || Peeled), LI, SE,
                          DT, AC);

  NumCompletelyUnrolled += CompletelyUnroll;
  ++NumUnrolled;

  Loop *OuterL = L->getParentLoop();
  // Update LoopInfo if the loop is completely removed.
  if (CompletelyUnroll)
    LI->erase(L);

  // After complete unrolling most of the blocks should be contained in OuterL.
  // However, some of them might happen to be out of OuterL (e.g. if they
  // precede a loop exit). In this case we might need to insert PHI nodes in
  // order to preserve LCSSA form.
  // We don't need to check this if we already know that we need to fix LCSSA
  // form.
  // TODO: For now we just recompute LCSSA for the outer loop in this case, but
  // it should be possible to fix it in-place.
  if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
    NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);

  // If we have a pass and a DominatorTree we should re-simplify impacted loops
  // to ensure subsequent analyses can rely on this form. We want to simplify
  // at least one layer outside of the loop that was unrolled so that any
  // changes to the parent loop exposed by the unrolling are considered.
  if (DT) {
    if (OuterL) {
      // OuterL includes all loops for which we can break loop-simplify, so
      // it's sufficient to simplify only it (it'll recursively simplify inner
      // loops too).
      if (NeedToFixLCSSA) {
        // LCSSA must be performed on the outermost affected loop. The unrolled
        // loop's last loop latch is guaranteed to be in the outermost loop
        // after LoopInfo's been updated by LoopInfo::erase.
        Loop *LatchLoop = LI->getLoopFor(Latches.back());
        Loop *FixLCSSALoop = OuterL;
        if (!FixLCSSALoop->contains(LatchLoop))
          while (FixLCSSALoop->getParentLoop() != LatchLoop)
            FixLCSSALoop = FixLCSSALoop->getParentLoop();

        formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
      } else if (PreserveLCSSA) {
        assert(OuterL->isLCSSAForm(*DT) &&
               "Loops should be in LCSSA form after loop-unroll.");
      }

      // TODO: That potentially might be compile-time expensive. We should try
      // to fix the loop-simplified form incrementally.
      simplifyLoop(OuterL, DT, LI, SE, AC, PreserveLCSSA);
    } else {
      // Simplify loops for which we might've broken loop-simplify form.
      for (Loop *SubLoop : LoopsToSimplify)
        simplifyLoop(SubLoop, DT, LI, SE, AC, PreserveLCSSA);
    }
  }

  return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
                          : LoopUnrollResult::PartiallyUnrolled;
}

/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
/// node with the given name (for example, "llvm.loop.unroll.count"). If no
/// such metadata node exists, then nullptr is returned.
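/// For example, given loop metadata of the form
///   !0 = distinct !{!0, !1}
///   !1 = !{!"llvm.loop.unroll.count", i32 4}
/// passing !0 and "llvm.loop.unroll.count" returns !1.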
MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
  // First operand should refer to the loop id itself.
  assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
  assert(LoopID->getOperand(0) == LoopID && "invalid loop id");

  for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
    MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
    if (!MD)
      continue;

    MDString *S = dyn_cast<MDString>(MD->getOperand(0));
    if (!S)
      continue;

    if (Name.equals(S->getString()))
      return MD;
  }
  return nullptr;
}