LoopUnrollRuntime.cpp 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873
  1. //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements some loop unrolling utilities for loops with run-time
  11. // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
  12. // trip counts.
  13. //
  14. // The functions in this file are used to generate extra code when the
  15. // run-time trip count modulo the unroll factor is not 0. When this is the
  16. // case, we need to generate code to execute these 'left over' iterations.
  17. //
  18. // The current strategy generates an if-then-else sequence prior to the
  19. // unrolled loop to execute the 'left over' iterations before or after the
  20. // unrolled loop.
  21. //
  22. //===----------------------------------------------------------------------===//
  23. #include "llvm/ADT/Statistic.h"
  24. #include "llvm/ADT/SmallSet.h"
  25. #include "llvm/Analysis/AliasAnalysis.h"
  26. #include "llvm/Analysis/LoopIterator.h"
  27. #include "llvm/Analysis/LoopPass.h"
  28. #include "llvm/Analysis/ScalarEvolution.h"
  29. #include "llvm/Analysis/ScalarEvolutionExpander.h"
  30. #include "llvm/IR/BasicBlock.h"
  31. #include "llvm/IR/Dominators.h"
  32. #include "llvm/IR/Metadata.h"
  33. #include "llvm/IR/Module.h"
  34. #include "llvm/Support/Debug.h"
  35. #include "llvm/Support/raw_ostream.h"
  36. #include "llvm/Transforms/Scalar.h"
  37. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  38. #include "llvm/Transforms/Utils/Cloning.h"
  39. #include "llvm/Transforms/Utils/LoopUtils.h"
  40. #include "llvm/Transforms/Utils/UnrollLoop.h"
  41. #include <algorithm>
  42. using namespace llvm;
  43. #define DEBUG_TYPE "loop-unroll"
  44. STATISTIC(NumRuntimeUnrolled,
  45. "Number of loops unrolled with run-time trip counts");
  46. static cl::opt<bool> UnrollRuntimeMultiExit(
  47. "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
  48. cl::desc("Allow runtime unrolling for loops with multiple exits, when "
  49. "epilog is generated"));
  50. /// Connect the unrolling prolog code to the original loop.
  51. /// The unrolling prolog code contains code to execute the
  52. /// 'extra' iterations if the run-time trip count modulo the
  53. /// unroll count is non-zero.
  54. ///
  55. /// This function performs the following:
  56. /// - Create PHI nodes at prolog end block to combine values
  57. /// that exit the prolog code and jump around the prolog.
  58. /// - Add a PHI operand to a PHI node at the loop exit block
  59. /// for values that exit the prolog and go around the loop.
  60. /// - Branch around the original loop if the trip count is less
  61. /// than the unroll factor.
  62. ///
  63. static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  64. BasicBlock *PrologExit,
  65. BasicBlock *OriginalLoopLatchExit,
  66. BasicBlock *PreHeader, BasicBlock *NewPreHeader,
  67. ValueToValueMapTy &VMap, DominatorTree *DT,
  68. LoopInfo *LI, bool PreserveLCSSA) {
  69. BasicBlock *Latch = L->getLoopLatch();
  70. assert(Latch && "Loop must have a latch");
  71. BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
  72. // Create a PHI node for each outgoing value from the original loop
  73. // (which means it is an outgoing value from the prolog code too).
  74. // The new PHI node is inserted in the prolog end basic block.
  75. // The new PHI node value is added as an operand of a PHI node in either
  76. // the loop header or the loop exit block.
  77. for (BasicBlock *Succ : successors(Latch)) {
  78. for (Instruction &BBI : *Succ) {
  79. PHINode *PN = dyn_cast<PHINode>(&BBI);
  80. // Exit when we passed all PHI nodes.
  81. if (!PN)
  82. break;
  83. // Add a new PHI node to the prolog end block and add the
  84. // appropriate incoming values.
  85. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
  86. PrologExit->getFirstNonPHI());
  87. // Adding a value to the new PHI node from the original loop preheader.
  88. // This is the value that skips all the prolog code.
  89. if (L->contains(PN)) {
  90. NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
  91. PreHeader);
  92. } else {
  93. NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
  94. }
  95. Value *V = PN->getIncomingValueForBlock(Latch);
  96. if (Instruction *I = dyn_cast<Instruction>(V)) {
  97. if (L->contains(I)) {
  98. V = VMap.lookup(I);
  99. }
  100. }
  101. // Adding a value to the new PHI node from the last prolog block
  102. // that was created.
  103. NewPN->addIncoming(V, PrologLatch);
  104. // Update the existing PHI node operand with the value from the
  105. // new PHI node. How this is done depends on if the existing
  106. // PHI node is in the original loop block, or the exit block.
  107. if (L->contains(PN)) {
  108. PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
  109. } else {
  110. PN->addIncoming(NewPN, PrologExit);
  111. }
  112. }
  113. }
  114. // Make sure that created prolog loop is in simplified form
  115. SmallVector<BasicBlock *, 4> PrologExitPreds;
  116. Loop *PrologLoop = LI->getLoopFor(PrologLatch);
  117. if (PrologLoop) {
  118. for (BasicBlock *PredBB : predecessors(PrologExit))
  119. if (PrologLoop->contains(PredBB))
  120. PrologExitPreds.push_back(PredBB);
  121. SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
  122. PreserveLCSSA);
  123. }
  124. // Create a branch around the original loop, which is taken if there are no
  125. // iterations remaining to be executed after running the prologue.
  126. Instruction *InsertPt = PrologExit->getTerminator();
  127. IRBuilder<> B(InsertPt);
  128. assert(Count != 0 && "nonsensical Count!");
  129. // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
  130. // This means %xtraiter is (BECount + 1) and all of the iterations of this
  131. // loop were executed by the prologue. Note that if BECount <u (Count - 1)
  132. // then (BECount + 1) cannot unsigned-overflow.
  133. Value *BrLoopExit =
  134. B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
  135. // Split the exit to maintain loop canonicalization guarantees
  136. SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
  137. SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
  138. PreserveLCSSA);
  139. // Add the branch to the exit block (around the unrolled loop)
  140. B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
  141. InsertPt->eraseFromParent();
  142. if (DT)
  143. DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
  144. }
  145. /// Connect the unrolling epilog code to the original loop.
  146. /// The unrolling epilog code contains code to execute the
  147. /// 'extra' iterations if the run-time trip count modulo the
  148. /// unroll count is non-zero.
  149. ///
  150. /// This function performs the following:
  151. /// - Update PHI nodes at the unrolling loop exit and epilog loop exit
  152. /// - Create PHI nodes at the unrolling loop exit to combine
  153. /// values that exit the unrolling loop code and jump around it.
  154. /// - Update PHI operands in the epilog loop by the new PHI nodes
  155. /// - Branch around the epilog loop if extra iters (ModVal) is zero.
  156. ///
  157. static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
  158. BasicBlock *Exit, BasicBlock *PreHeader,
  159. BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
  160. ValueToValueMapTy &VMap, DominatorTree *DT,
  161. LoopInfo *LI, bool PreserveLCSSA) {
  162. BasicBlock *Latch = L->getLoopLatch();
  163. assert(Latch && "Loop must have a latch");
  164. BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
  165. // Loop structure should be the following:
  166. //
  167. // PreHeader
  168. // NewPreHeader
  169. // Header
  170. // ...
  171. // Latch
  172. // NewExit (PN)
  173. // EpilogPreHeader
  174. // EpilogHeader
  175. // ...
  176. // EpilogLatch
  177. // Exit (EpilogPN)
  178. // Update PHI nodes at NewExit and Exit.
  179. for (Instruction &BBI : *NewExit) {
  180. PHINode *PN = dyn_cast<PHINode>(&BBI);
  181. // Exit when we passed all PHI nodes.
  182. if (!PN)
  183. break;
  184. // PN should be used in another PHI located in Exit block as
  185. // Exit was split by SplitBlockPredecessors into Exit and NewExit
  186. // Basicaly it should look like:
  187. // NewExit:
  188. // PN = PHI [I, Latch]
  189. // ...
  190. // Exit:
  191. // EpilogPN = PHI [PN, EpilogPreHeader]
  192. //
  193. // There is EpilogPreHeader incoming block instead of NewExit as
  194. // NewExit was spilt 1 more time to get EpilogPreHeader.
  195. assert(PN->hasOneUse() && "The phi should have 1 use");
  196. PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
  197. assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
  198. // Add incoming PreHeader from branch around the Loop
  199. PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
  200. Value *V = PN->getIncomingValueForBlock(Latch);
  201. Instruction *I = dyn_cast<Instruction>(V);
  202. if (I && L->contains(I))
  203. // If value comes from an instruction in the loop add VMap value.
  204. V = VMap.lookup(I);
  205. // For the instruction out of the loop, constant or undefined value
  206. // insert value itself.
  207. EpilogPN->addIncoming(V, EpilogLatch);
  208. assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
  209. "EpilogPN should have EpilogPreHeader incoming block");
  210. // Change EpilogPreHeader incoming block to NewExit.
  211. EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
  212. NewExit);
  213. // Now PHIs should look like:
  214. // NewExit:
  215. // PN = PHI [I, Latch], [undef, PreHeader]
  216. // ...
  217. // Exit:
  218. // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
  219. }
  220. // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
  221. // Update corresponding PHI nodes in epilog loop.
  222. for (BasicBlock *Succ : successors(Latch)) {
  223. // Skip this as we already updated phis in exit blocks.
  224. if (!L->contains(Succ))
  225. continue;
  226. for (Instruction &BBI : *Succ) {
  227. PHINode *PN = dyn_cast<PHINode>(&BBI);
  228. // Exit when we passed all PHI nodes.
  229. if (!PN)
  230. break;
  231. // Add new PHI nodes to the loop exit block and update epilog
  232. // PHIs with the new PHI values.
  233. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
  234. NewExit->getFirstNonPHI());
  235. // Adding a value to the new PHI node from the unrolling loop preheader.
  236. NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
  237. // Adding a value to the new PHI node from the unrolling loop latch.
  238. NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
  239. // Update the existing PHI node operand with the value from the new PHI
  240. // node. Corresponding instruction in epilog loop should be PHI.
  241. PHINode *VPN = cast<PHINode>(VMap[&BBI]);
  242. VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
  243. }
  244. }
  245. Instruction *InsertPt = NewExit->getTerminator();
  246. IRBuilder<> B(InsertPt);
  247. Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
  248. assert(Exit && "Loop must have a single exit block only");
  249. // Split the epilogue exit to maintain loop canonicalization guarantees
  250. SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
  251. SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
  252. PreserveLCSSA);
  253. // Add the branch to the exit block (around the unrolling loop)
  254. B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
  255. InsertPt->eraseFromParent();
  256. if (DT)
  257. DT->changeImmediateDominator(Exit, NewExit);
  258. // Split the main loop exit to maintain canonicalization guarantees.
  259. SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
  260. SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI,
  261. PreserveLCSSA);
  262. }
  263. /// Create a clone of the blocks in a loop and connect them together.
  264. /// If CreateRemainderLoop is false, loop structure will not be cloned,
  265. /// otherwise a new loop will be created including all cloned blocks, and the
  266. /// iterator of it switches to count NewIter down to 0.
  267. /// The cloned blocks should be inserted between InsertTop and InsertBot.
  268. /// If loop structure is cloned InsertTop should be new preheader, InsertBot
  269. /// new loop exit.
  270. /// Return the new cloned loop that is created when CreateRemainderLoop is true.
  271. static Loop *
  272. CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
  273. const bool UseEpilogRemainder, BasicBlock *InsertTop,
  274. BasicBlock *InsertBot, BasicBlock *Preheader,
  275. std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
  276. ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
  277. StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
  278. BasicBlock *Header = L->getHeader();
  279. BasicBlock *Latch = L->getLoopLatch();
  280. Function *F = Header->getParent();
  281. LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  282. LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  283. Loop *ParentLoop = L->getParentLoop();
  284. NewLoopsMap NewLoops;
  285. NewLoops[ParentLoop] = ParentLoop;
  286. if (!CreateRemainderLoop)
  287. NewLoops[L] = ParentLoop;
  288. // For each block in the original loop, create a new copy,
  289. // and update the value map with the newly created values.
  290. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  291. BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
  292. NewBlocks.push_back(NewBB);
  293. // If we're unrolling the outermost loop, there's no remainder loop,
  294. // and this block isn't in a nested loop, then the new block is not
  295. // in any loop. Otherwise, add it to loopinfo.
  296. if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
  297. addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
  298. VMap[*BB] = NewBB;
  299. if (Header == *BB) {
  300. // For the first block, add a CFG connection to this newly
  301. // created block.
  302. InsertTop->getTerminator()->setSuccessor(0, NewBB);
  303. }
  304. if (DT) {
  305. if (Header == *BB) {
  306. // The header is dominated by the preheader.
  307. DT->addNewBlock(NewBB, InsertTop);
  308. } else {
  309. // Copy information from original loop to unrolled loop.
  310. BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
  311. DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
  312. }
  313. }
  314. if (Latch == *BB) {
  315. // For the last block, if CreateRemainderLoop is false, create a direct
  316. // jump to InsertBot. If not, create a loop back to cloned head.
  317. VMap.erase((*BB)->getTerminator());
  318. BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
  319. BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
  320. IRBuilder<> Builder(LatchBR);
  321. if (!CreateRemainderLoop) {
  322. Builder.CreateBr(InsertBot);
  323. } else {
  324. PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
  325. suffix + ".iter",
  326. FirstLoopBB->getFirstNonPHI());
  327. Value *IdxSub =
  328. Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  329. NewIdx->getName() + ".sub");
  330. Value *IdxCmp =
  331. Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
  332. Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
  333. NewIdx->addIncoming(NewIter, InsertTop);
  334. NewIdx->addIncoming(IdxSub, NewBB);
  335. }
  336. LatchBR->eraseFromParent();
  337. }
  338. }
  339. // Change the incoming values to the ones defined in the preheader or
  340. // cloned loop.
  341. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  342. PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
  343. if (!CreateRemainderLoop) {
  344. if (UseEpilogRemainder) {
  345. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  346. NewPHI->setIncomingBlock(idx, InsertTop);
  347. NewPHI->removeIncomingValue(Latch, false);
  348. } else {
  349. VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
  350. cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
  351. }
  352. } else {
  353. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  354. NewPHI->setIncomingBlock(idx, InsertTop);
  355. BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  356. idx = NewPHI->getBasicBlockIndex(Latch);
  357. Value *InVal = NewPHI->getIncomingValue(idx);
  358. NewPHI->setIncomingBlock(idx, NewLatch);
  359. if (Value *V = VMap.lookup(InVal))
  360. NewPHI->setIncomingValue(idx, V);
  361. }
  362. }
  363. if (CreateRemainderLoop) {
  364. Loop *NewLoop = NewLoops[L];
  365. assert(NewLoop && "L should have been cloned");
  366. // Add unroll disable metadata to disable future unrolling for this loop.
  367. SmallVector<Metadata *, 4> MDs;
  368. // Reserve first location for self reference to the LoopID metadata node.
  369. MDs.push_back(nullptr);
  370. MDNode *LoopID = NewLoop->getLoopID();
  371. if (LoopID) {
  372. // First remove any existing loop unrolling metadata.
  373. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
  374. bool IsUnrollMetadata = false;
  375. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  376. if (MD) {
  377. const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  378. IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
  379. }
  380. if (!IsUnrollMetadata)
  381. MDs.push_back(LoopID->getOperand(i));
  382. }
  383. }
  384. LLVMContext &Context = NewLoop->getHeader()->getContext();
  385. SmallVector<Metadata *, 1> DisableOperands;
  386. DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
  387. MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  388. MDs.push_back(DisableNode);
  389. MDNode *NewLoopID = MDNode::get(Context, MDs);
  390. // Set operand 0 to refer to the loop id itself.
  391. NewLoopID->replaceOperandWith(0, NewLoopID);
  392. NewLoop->setLoopID(NewLoopID);
  393. return NewLoop;
  394. }
  395. else
  396. return nullptr;
  397. }
  398. /// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
  399. /// is populated with all the loop exit blocks other than the LatchExit block.
  400. static bool
  401. canSafelyUnrollMultiExitLoop(Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits,
  402. BasicBlock *LatchExit, bool PreserveLCSSA,
  403. bool UseEpilogRemainder) {
  404. // Support runtime unrolling for multiple exit blocks and multiple exiting
  405. // blocks.
  406. if (!UnrollRuntimeMultiExit)
  407. return false;
  408. // Even if runtime multi exit is enabled, we currently have some correctness
  409. // constrains in unrolling a multi-exit loop.
  410. // We rely on LCSSA form being preserved when the exit blocks are transformed.
  411. if (!PreserveLCSSA)
  412. return false;
  413. SmallVector<BasicBlock *, 4> Exits;
  414. L->getUniqueExitBlocks(Exits);
  415. for (auto *BB : Exits)
  416. if (BB != LatchExit)
  417. OtherExits.push_back(BB);
  418. // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
  419. // UnrollRuntimeMultiExit is true. This will need updating the logic in
  420. // connectEpilog/connectProlog.
  421. if (!LatchExit->getSinglePredecessor()) {
  422. DEBUG(dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
  423. "predecessor.\n");
  424. return false;
  425. }
  426. // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
  427. // and L is an inner loop. This is because in presence of multiple exits, the
  428. // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
  429. // outer loop. This is automatically handled in the prolog case, so we do not
  430. // have that bug in prolog generation.
  431. if (UseEpilogRemainder && L->getParentLoop())
  432. return false;
  433. // All constraints have been satisfied.
  434. return true;
  435. }
  436. /// Insert code in the prolog/epilog code when unrolling a loop with a
  437. /// run-time trip-count.
  438. ///
  439. /// This method assumes that the loop unroll factor is total number
  440. /// of loop bodies in the loop after unrolling. (Some folks refer
  441. /// to the unroll factor as the number of *extra* copies added).
  442. /// We assume also that the loop unroll factor is a power-of-two. So, after
  443. /// unrolling the loop, the number of loop bodies executed is 2,
  444. /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
  445. /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
  446. /// the switch instruction is generated.
  447. ///
  448. /// ***Prolog case***
  449. /// extraiters = tripcount % loopfactor
  450. /// if (extraiters == 0) jump Loop:
  451. /// else jump Prol:
  452. /// Prol: LoopBody;
  453. /// extraiters -= 1 // Omitted if unroll factor is 2.
  454. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
  455. /// if (tripcount < loopfactor) jump End:
  456. /// Loop:
  457. /// ...
  458. /// End:
  459. ///
  460. /// ***Epilog case***
  461. /// extraiters = tripcount % loopfactor
  462. /// if (tripcount < loopfactor) jump LoopExit:
  463. /// unroll_iters = tripcount - extraiters
  464. /// Loop: LoopBody; (executes unroll_iter times);
  465. /// unroll_iter -= 1
  466. /// if (unroll_iter != 0) jump Loop:
  467. /// LoopExit:
  468. /// if (extraiters == 0) jump EpilExit:
  469. /// Epil: LoopBody; (executes extraiters times)
  470. /// extraiters -= 1 // Omitted if unroll factor is 2.
  471. /// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
  472. /// EpilExit:
  473. bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
  474. bool AllowExpensiveTripCount,
  475. bool UseEpilogRemainder,
  476. LoopInfo *LI, ScalarEvolution *SE,
  477. DominatorTree *DT, bool PreserveLCSSA) {
  478. DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
  479. DEBUG(L->dump());
  480. // Make sure the loop is in canonical form.
  481. if (!L->isLoopSimplifyForm()) {
  482. DEBUG(dbgs() << "Not in simplify form!\n");
  483. return false;
  484. }
  485. // Guaranteed by LoopSimplifyForm.
  486. BasicBlock *Latch = L->getLoopLatch();
  487. BasicBlock *Header = L->getHeader();
  488. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  489. unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
  490. BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
  491. // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
  492. // targets of the Latch be an exit block out of the loop. This needs
  493. // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
  494. assert(!L->contains(LatchExit) &&
  495. "one of the loop latch successors should be the exit block!");
  496. // These are exit blocks other than the target of the latch exiting block.
  497. SmallVector<BasicBlock *, 4> OtherExits;
  498. bool isMultiExitUnrollingEnabled = canSafelyUnrollMultiExitLoop(
  499. L, OtherExits, LatchExit, PreserveLCSSA, UseEpilogRemainder);
  500. // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
  501. if (!isMultiExitUnrollingEnabled &&
  502. (!L->getExitingBlock() || OtherExits.size())) {
  503. DEBUG(
  504. dbgs()
  505. << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
  506. "enabled!\n");
  507. return false;
  508. }
  509. // Use Scalar Evolution to compute the trip count. This allows more loops to
  510. // be unrolled than relying on induction var simplification.
  511. if (!SE)
  512. return false;
  513. // Only unroll loops with a computable trip count, and the trip count needs
  514. // to be an int value (allowing a pointer type is a TODO item).
  515. // We calculate the backedge count by using getExitCount on the Latch block,
  516. // which is proven to be the only exiting block in this loop. This is same as
  517. // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
  518. // exiting blocks).
  519. const SCEV *BECountSC = SE->getExitCount(L, Latch);
  520. if (isa<SCEVCouldNotCompute>(BECountSC) ||
  521. !BECountSC->getType()->isIntegerTy()) {
  522. DEBUG(dbgs() << "Could not compute exit block SCEV\n");
  523. return false;
  524. }
  525. unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
  526. // Add 1 since the backedge count doesn't include the first loop iteration.
  527. const SCEV *TripCountSC =
  528. SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
  529. if (isa<SCEVCouldNotCompute>(TripCountSC)) {
  530. DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
  531. return false;
  532. }
  533. BasicBlock *PreHeader = L->getLoopPreheader();
  534. BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  535. const DataLayout &DL = Header->getModule()->getDataLayout();
  536. SCEVExpander Expander(*SE, DL, "loop-unroll");
  537. if (!AllowExpensiveTripCount &&
  538. Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
  539. DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
  540. return false;
  541. }
  542. // This constraint lets us deal with an overflowing trip count easily; see the
  543. // comment on ModVal below.
  544. if (Log2_32(Count) > BEWidth) {
  545. DEBUG(dbgs()
  546. << "Count failed constraint on overflow trip count calculation.\n");
  547. return false;
  548. }
  549. // Loop structure is the following:
  550. //
  551. // PreHeader
  552. // Header
  553. // ...
  554. // Latch
  555. // LatchExit
  556. BasicBlock *NewPreHeader;
  557. BasicBlock *NewExit = nullptr;
  558. BasicBlock *PrologExit = nullptr;
  559. BasicBlock *EpilogPreHeader = nullptr;
  560. BasicBlock *PrologPreHeader = nullptr;
  561. if (UseEpilogRemainder) {
  562. // If epilog remainder
  563. // Split PreHeader to insert a branch around loop for unrolling.
  564. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
  565. NewPreHeader->setName(PreHeader->getName() + ".new");
  566. // Split LatchExit to create phi nodes from branch above.
  567. SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
  568. NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa",
  569. DT, LI, PreserveLCSSA);
  570. // Split NewExit to insert epilog remainder loop.
  571. EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
  572. EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
  573. } else {
  574. // If prolog remainder
  575. // Split the original preheader twice to insert prolog remainder loop
  576. PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
  577. PrologPreHeader->setName(Header->getName() + ".prol.preheader");
  578. PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
  579. DT, LI);
  580. PrologExit->setName(Header->getName() + ".prol.loopexit");
  581. // Split PrologExit to get NewPreHeader.
  582. NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
  583. NewPreHeader->setName(PreHeader->getName() + ".new");
  584. }
  585. // Loop structure should be the following:
  586. // Epilog Prolog
  587. //
  588. // PreHeader PreHeader
  589. // *NewPreHeader *PrologPreHeader
  590. // Header *PrologExit
  591. // ... *NewPreHeader
  592. // Latch Header
  593. // *NewExit ...
  594. // *EpilogPreHeader Latch
  595. // LatchExit LatchExit
  596. // Calculate conditions for branch around loop for unrolling
  597. // in epilog case and around prolog remainder loop in prolog case.
  598. // Compute the number of extra iterations required, which is:
  599. // extra iterations = run-time trip count % loop unroll factor
  600. PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  601. Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
  602. PreHeaderBR);
  603. Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
  604. PreHeaderBR);
  605. IRBuilder<> B(PreHeaderBR);
  606. Value *ModVal;
  607. // Calculate ModVal = (BECount + 1) % Count.
  608. // Note that TripCount is BECount + 1.
  609. if (isPowerOf2_32(Count)) {
  610. // When Count is power of 2 we don't BECount for epilog case, however we'll
  611. // need it for a branch around unrolling loop for prolog case.
  612. ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
  613. // 1. There are no iterations to be run in the prolog/epilog loop.
  614. // OR
  615. // 2. The addition computing TripCount overflowed.
  616. //
  617. // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
  618. // the number of iterations that remain to be run in the original loop is a
  619. // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
  620. // explicitly check this above).
  621. } else {
  622. // As (BECount + 1) can potentially unsigned overflow we count
  623. // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
  624. Value *ModValTmp = B.CreateURem(BECount,
  625. ConstantInt::get(BECount->getType(),
  626. Count));
  627. Value *ModValAdd = B.CreateAdd(ModValTmp,
  628. ConstantInt::get(ModValTmp->getType(), 1));
  629. // At that point (BECount % Count) + 1 could be equal to Count.
  630. // To handle this case we need to take mod by Count one more time.
  631. ModVal = B.CreateURem(ModValAdd,
  632. ConstantInt::get(BECount->getType(), Count),
  633. "xtraiter");
  634. }
  635. Value *BranchVal =
  636. UseEpilogRemainder ? B.CreateICmpULT(BECount,
  637. ConstantInt::get(BECount->getType(),
  638. Count - 1)) :
  639. B.CreateIsNotNull(ModVal, "lcmp.mod");
  640. BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
  641. BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
  642. // Branch to either remainder (extra iterations) loop or unrolling loop.
  643. B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
  644. PreHeaderBR->eraseFromParent();
  645. if (DT) {
  646. if (UseEpilogRemainder)
  647. DT->changeImmediateDominator(NewExit, PreHeader);
  648. else
  649. DT->changeImmediateDominator(PrologExit, PreHeader);
  650. }
  651. Function *F = Header->getParent();
  652. // Get an ordered list of blocks in the loop to help with the ordering of the
  653. // cloned blocks in the prolog/epilog code
  654. LoopBlocksDFS LoopBlocks(L);
  655. LoopBlocks.perform(LI);
  656. //
  657. // For each extra loop iteration, create a copy of the loop's basic blocks
  658. // and generate a condition that branches to the copy depending on the
  659. // number of 'left over' iterations.
  660. //
  661. std::vector<BasicBlock *> NewBlocks;
  662. ValueToValueMapTy VMap;
  663. // For unroll factor 2 remainder loop will have 1 iterations.
  664. // Do not create 1 iteration loop.
  665. bool CreateRemainderLoop = (Count != 2);
  666. // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  667. // the loop, otherwise we create a cloned loop to execute the extra
  668. // iterations. This function adds the appropriate CFG connections.
  669. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
  670. BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
  671. Loop *remainderLoop = CloneLoopBlocks(
  672. L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
  673. NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
  674. // Insert the cloned blocks into the function.
  675. F->getBasicBlockList().splice(InsertBot->getIterator(),
  676. F->getBasicBlockList(),
  677. NewBlocks[0]->getIterator(),
  678. F->end());
  679. // Now the loop blocks are cloned and the other exiting blocks from the
  680. // remainder are connected to the original Loop's exit blocks. The remaining
  681. // work is to update the phi nodes in the original loop, and take in the
  682. // values from the cloned region. Also update the dominator info for
  683. // OtherExits and their immediate successors, since we have new edges into
  684. // OtherExits.
  685. SmallSet<BasicBlock*, 8> ImmediateSuccessorsOfExitBlocks;
  686. for (auto *BB : OtherExits) {
  687. for (auto &II : *BB) {
  688. // Given we preserve LCSSA form, we know that the values used outside the
  689. // loop will be used through these phi nodes at the exit blocks that are
  690. // transformed below.
  691. if (!isa<PHINode>(II))
  692. break;
  693. PHINode *Phi = cast<PHINode>(&II);
  694. unsigned oldNumOperands = Phi->getNumIncomingValues();
  695. // Add the incoming values from the remainder code to the end of the phi
  696. // node.
  697. for (unsigned i =0; i < oldNumOperands; i++){
  698. Value *newVal = VMap[Phi->getIncomingValue(i)];
  699. // newVal can be a constant or derived from values outside the loop, and
  700. // hence need not have a VMap value.
  701. if (!newVal)
  702. newVal = Phi->getIncomingValue(i);
  703. Phi->addIncoming(newVal,
  704. cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
  705. }
  706. }
  707. #ifdef EXPENSIVE_CHECKS
  708. for (BasicBlock *SuccBB : successors(BB)) {
  709. assert(!(any_of(OtherExits,
  710. [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
  711. SuccBB == LatchExit) &&
  712. "Breaks the definition of dedicated exits!");
  713. }
  714. #endif
  715. // Update the dominator info because the immediate dominator is no longer the
  716. // header of the original Loop. BB has edges both from L and remainder code.
  717. // Since the preheader determines which loop is run (L or directly jump to
  718. // the remainder code), we set the immediate dominator as the preheader.
  719. if (DT) {
  720. DT->changeImmediateDominator(BB, PreHeader);
  721. // Also update the IDom for immediate successors of BB. If the current
  722. // IDom is the header, update the IDom to be the preheader because that is
  723. // the nearest common dominator of all predecessors of SuccBB. We need to
  724. // check for IDom being the header because successors of exit blocks can
  725. // have edges from outside the loop, and we should not incorrectly update
  726. // the IDom in that case.
  727. for (BasicBlock *SuccBB: successors(BB))
  728. if (ImmediateSuccessorsOfExitBlocks.insert(SuccBB).second) {
  729. if (DT->getNode(SuccBB)->getIDom()->getBlock() == Header) {
  730. assert(!SuccBB->getSinglePredecessor() &&
  731. "BB should be the IDom then!");
  732. DT->changeImmediateDominator(SuccBB, PreHeader);
  733. }
  734. }
  735. }
  736. }
  737. // Loop structure should be the following:
  738. // Epilog Prolog
  739. //
  740. // PreHeader PreHeader
  741. // NewPreHeader PrologPreHeader
  742. // Header PrologHeader
  743. // ... ...
  744. // Latch PrologLatch
  745. // NewExit PrologExit
  746. // EpilogPreHeader NewPreHeader
  747. // EpilogHeader Header
  748. // ... ...
  749. // EpilogLatch Latch
  750. // LatchExit LatchExit
  751. // Rewrite the cloned instruction operands to use the values created when the
  752. // clone is created.
  753. for (BasicBlock *BB : NewBlocks) {
  754. for (Instruction &I : *BB) {
  755. RemapInstruction(&I, VMap,
  756. RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
  757. }
  758. }
  759. if (UseEpilogRemainder) {
  760. // Connect the epilog code to the original loop and update the
  761. // PHI functions.
  762. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
  763. EpilogPreHeader, NewPreHeader, VMap, DT, LI,
  764. PreserveLCSSA);
  765. // Update counter in loop for unrolling.
  766. // I should be multiply of Count.
  767. IRBuilder<> B2(NewPreHeader->getTerminator());
  768. Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
  769. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  770. B2.SetInsertPoint(LatchBR);
  771. PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
  772. Header->getFirstNonPHI());
  773. Value *IdxSub =
  774. B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  775. NewIdx->getName() + ".nsub");
  776. Value *IdxCmp;
  777. if (LatchBR->getSuccessor(0) == Header)
  778. IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
  779. else
  780. IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
  781. NewIdx->addIncoming(TestVal, NewPreHeader);
  782. NewIdx->addIncoming(IdxSub, Latch);
  783. LatchBR->setCondition(IdxCmp);
  784. } else {
  785. // Connect the prolog code to the original loop and update the
  786. // PHI functions.
  787. ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
  788. NewPreHeader, VMap, DT, LI, PreserveLCSSA);
  789. }
  790. // If this loop is nested, then the loop unroller changes the code in the
  791. // parent loop, so the Scalar Evolution pass needs to be run again.
  792. if (Loop *ParentLoop = L->getParentLoop())
  793. SE->forgetLoop(ParentLoop);
  794. // Canonicalize to LoopSimplifyForm both original and remainder loops. We
  795. // cannot rely on the LoopUnrollPass to do this because it only does
  796. // canonicalization for parent/subloops and not the sibling loops.
  797. if (OtherExits.size() > 0) {
  798. // Generate dedicated exit blocks for the original loop, to preserve
  799. // LoopSimplifyForm.
  800. formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
  801. // Generate dedicated exit blocks for the remainder loop if one exists, to
  802. // preserve LoopSimplifyForm.
  803. if (remainderLoop)
  804. formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
  805. }
  806. NumRuntimeUnrolled++;
  807. return true;
  808. }