LoopUnrollRuntime.cpp 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806
  1. //===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements some loop unrolling utilities for loops with run-time
  11. // trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
  12. // trip counts.
  13. //
  14. // The functions in this file are used to generate extra code when the
  15. // run-time trip count modulo the unroll factor is not 0. When this is the
  16. // case, we need to generate code to execute these 'left over' iterations.
  17. //
  18. // The current strategy generates an if-then-else sequence prior to the
  19. // unrolled loop to execute the 'left over' iterations before or after the
  20. // unrolled loop.
  21. //
  22. //===----------------------------------------------------------------------===//
  23. #include "llvm/ADT/Statistic.h"
  24. #include "llvm/Analysis/AliasAnalysis.h"
  25. #include "llvm/Analysis/LoopIterator.h"
  26. #include "llvm/Analysis/LoopPass.h"
  27. #include "llvm/Analysis/ScalarEvolution.h"
  28. #include "llvm/Analysis/ScalarEvolutionExpander.h"
  29. #include "llvm/IR/BasicBlock.h"
  30. #include "llvm/IR/Dominators.h"
  31. #include "llvm/IR/Metadata.h"
  32. #include "llvm/IR/Module.h"
  33. #include "llvm/Support/Debug.h"
  34. #include "llvm/Support/raw_ostream.h"
  35. #include "llvm/Transforms/Scalar.h"
  36. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  37. #include "llvm/Transforms/Utils/Cloning.h"
  38. #include "llvm/Transforms/Utils/LoopUtils.h"
  39. #include "llvm/Transforms/Utils/UnrollLoop.h"
  40. #include <algorithm>
  41. using namespace llvm;
  42. #define DEBUG_TYPE "loop-unroll"
  43. STATISTIC(NumRuntimeUnrolled,
  44. "Number of loops unrolled with run-time trip counts");
  45. static cl::opt<bool> UnrollRuntimeMultiExit(
  46. "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
  47. cl::desc("Allow runtime unrolling for loops with multiple exits, when "
  48. "epilog is generated"));
  49. /// Connect the unrolling prolog code to the original loop.
  50. /// The unrolling prolog code contains code to execute the
  51. /// 'extra' iterations if the run-time trip count modulo the
  52. /// unroll count is non-zero.
  53. ///
  54. /// This function performs the following:
  55. /// - Create PHI nodes at prolog end block to combine values
  56. /// that exit the prolog code and jump around the prolog.
  57. /// - Add a PHI operand to a PHI node at the loop exit block
  58. /// for values that exit the prolog and go around the loop.
  59. /// - Branch around the original loop if the trip count is less
  60. /// than the unroll factor.
  61. ///
  62. static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
  63. BasicBlock *PrologExit,
  64. BasicBlock *OriginalLoopLatchExit,
  65. BasicBlock *PreHeader, BasicBlock *NewPreHeader,
  66. ValueToValueMapTy &VMap, DominatorTree *DT,
  67. LoopInfo *LI, bool PreserveLCSSA) {
  68. BasicBlock *Latch = L->getLoopLatch();
  69. assert(Latch && "Loop must have a latch");
  70. BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
  71. // Create a PHI node for each outgoing value from the original loop
  72. // (which means it is an outgoing value from the prolog code too).
  73. // The new PHI node is inserted in the prolog end basic block.
  74. // The new PHI node value is added as an operand of a PHI node in either
  75. // the loop header or the loop exit block.
  76. for (BasicBlock *Succ : successors(Latch)) {
  77. for (Instruction &BBI : *Succ) {
  78. PHINode *PN = dyn_cast<PHINode>(&BBI);
  79. // Exit when we passed all PHI nodes.
  80. if (!PN)
  81. break;
  82. // Add a new PHI node to the prolog end block and add the
  83. // appropriate incoming values.
  84. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
  85. PrologExit->getFirstNonPHI());
  86. // Adding a value to the new PHI node from the original loop preheader.
  87. // This is the value that skips all the prolog code.
  88. if (L->contains(PN)) {
  89. NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader),
  90. PreHeader);
  91. } else {
  92. NewPN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
  93. }
  94. Value *V = PN->getIncomingValueForBlock(Latch);
  95. if (Instruction *I = dyn_cast<Instruction>(V)) {
  96. if (L->contains(I)) {
  97. V = VMap.lookup(I);
  98. }
  99. }
  100. // Adding a value to the new PHI node from the last prolog block
  101. // that was created.
  102. NewPN->addIncoming(V, PrologLatch);
  103. // Update the existing PHI node operand with the value from the
  104. // new PHI node. How this is done depends on if the existing
  105. // PHI node is in the original loop block, or the exit block.
  106. if (L->contains(PN)) {
  107. PN->setIncomingValue(PN->getBasicBlockIndex(NewPreHeader), NewPN);
  108. } else {
  109. PN->addIncoming(NewPN, PrologExit);
  110. }
  111. }
  112. }
  113. // Make sure that created prolog loop is in simplified form
  114. SmallVector<BasicBlock *, 4> PrologExitPreds;
  115. Loop *PrologLoop = LI->getLoopFor(PrologLatch);
  116. if (PrologLoop) {
  117. for (BasicBlock *PredBB : predecessors(PrologExit))
  118. if (PrologLoop->contains(PredBB))
  119. PrologExitPreds.push_back(PredBB);
  120. SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
  121. PreserveLCSSA);
  122. }
  123. // Create a branch around the original loop, which is taken if there are no
  124. // iterations remaining to be executed after running the prologue.
  125. Instruction *InsertPt = PrologExit->getTerminator();
  126. IRBuilder<> B(InsertPt);
  127. assert(Count != 0 && "nonsensical Count!");
  128. // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
  129. // This means %xtraiter is (BECount + 1) and all of the iterations of this
  130. // loop were executed by the prologue. Note that if BECount <u (Count - 1)
  131. // then (BECount + 1) cannot unsigned-overflow.
  132. Value *BrLoopExit =
  133. B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
  134. // Split the exit to maintain loop canonicalization guarantees
  135. SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
  136. SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
  137. PreserveLCSSA);
  138. // Add the branch to the exit block (around the unrolled loop)
  139. B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
  140. InsertPt->eraseFromParent();
  141. if (DT)
  142. DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
  143. }
  144. /// Connect the unrolling epilog code to the original loop.
  145. /// The unrolling epilog code contains code to execute the
  146. /// 'extra' iterations if the run-time trip count modulo the
  147. /// unroll count is non-zero.
  148. ///
  149. /// This function performs the following:
  150. /// - Update PHI nodes at the unrolling loop exit and epilog loop exit
  151. /// - Create PHI nodes at the unrolling loop exit to combine
  152. /// values that exit the unrolling loop code and jump around it.
  153. /// - Update PHI operands in the epilog loop by the new PHI nodes
  154. /// - Branch around the epilog loop if extra iters (ModVal) is zero.
  155. ///
  156. static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
  157. BasicBlock *Exit, BasicBlock *PreHeader,
  158. BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
  159. ValueToValueMapTy &VMap, DominatorTree *DT,
  160. LoopInfo *LI, bool PreserveLCSSA) {
  161. BasicBlock *Latch = L->getLoopLatch();
  162. assert(Latch && "Loop must have a latch");
  163. BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
  164. // Loop structure should be the following:
  165. //
  166. // PreHeader
  167. // NewPreHeader
  168. // Header
  169. // ...
  170. // Latch
  171. // NewExit (PN)
  172. // EpilogPreHeader
  173. // EpilogHeader
  174. // ...
  175. // EpilogLatch
  176. // Exit (EpilogPN)
  177. // Update PHI nodes at NewExit and Exit.
  178. for (Instruction &BBI : *NewExit) {
  179. PHINode *PN = dyn_cast<PHINode>(&BBI);
  180. // Exit when we passed all PHI nodes.
  181. if (!PN)
  182. break;
  183. // PN should be used in another PHI located in Exit block as
  184. // Exit was split by SplitBlockPredecessors into Exit and NewExit
  185. // Basicaly it should look like:
  186. // NewExit:
  187. // PN = PHI [I, Latch]
  188. // ...
  189. // Exit:
  190. // EpilogPN = PHI [PN, EpilogPreHeader]
  191. //
  192. // There is EpilogPreHeader incoming block instead of NewExit as
  193. // NewExit was spilt 1 more time to get EpilogPreHeader.
  194. assert(PN->hasOneUse() && "The phi should have 1 use");
  195. PHINode *EpilogPN = cast<PHINode> (PN->use_begin()->getUser());
  196. assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
  197. // Add incoming PreHeader from branch around the Loop
  198. PN->addIncoming(UndefValue::get(PN->getType()), PreHeader);
  199. Value *V = PN->getIncomingValueForBlock(Latch);
  200. Instruction *I = dyn_cast<Instruction>(V);
  201. if (I && L->contains(I))
  202. // If value comes from an instruction in the loop add VMap value.
  203. V = VMap.lookup(I);
  204. // For the instruction out of the loop, constant or undefined value
  205. // insert value itself.
  206. EpilogPN->addIncoming(V, EpilogLatch);
  207. assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
  208. "EpilogPN should have EpilogPreHeader incoming block");
  209. // Change EpilogPreHeader incoming block to NewExit.
  210. EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
  211. NewExit);
  212. // Now PHIs should look like:
  213. // NewExit:
  214. // PN = PHI [I, Latch], [undef, PreHeader]
  215. // ...
  216. // Exit:
  217. // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
  218. }
  219. // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
  220. // Update corresponding PHI nodes in epilog loop.
  221. for (BasicBlock *Succ : successors(Latch)) {
  222. // Skip this as we already updated phis in exit blocks.
  223. if (!L->contains(Succ))
  224. continue;
  225. for (Instruction &BBI : *Succ) {
  226. PHINode *PN = dyn_cast<PHINode>(&BBI);
  227. // Exit when we passed all PHI nodes.
  228. if (!PN)
  229. break;
  230. // Add new PHI nodes to the loop exit block and update epilog
  231. // PHIs with the new PHI values.
  232. PHINode *NewPN = PHINode::Create(PN->getType(), 2, PN->getName() + ".unr",
  233. NewExit->getFirstNonPHI());
  234. // Adding a value to the new PHI node from the unrolling loop preheader.
  235. NewPN->addIncoming(PN->getIncomingValueForBlock(NewPreHeader), PreHeader);
  236. // Adding a value to the new PHI node from the unrolling loop latch.
  237. NewPN->addIncoming(PN->getIncomingValueForBlock(Latch), Latch);
  238. // Update the existing PHI node operand with the value from the new PHI
  239. // node. Corresponding instruction in epilog loop should be PHI.
  240. PHINode *VPN = cast<PHINode>(VMap[&BBI]);
  241. VPN->setIncomingValue(VPN->getBasicBlockIndex(EpilogPreHeader), NewPN);
  242. }
  243. }
  244. Instruction *InsertPt = NewExit->getTerminator();
  245. IRBuilder<> B(InsertPt);
  246. Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
  247. assert(Exit && "Loop must have a single exit block only");
  248. // Split the epilogue exit to maintain loop canonicalization guarantees
  249. SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
  250. SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI,
  251. PreserveLCSSA);
  252. // Add the branch to the exit block (around the unrolling loop)
  253. B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
  254. InsertPt->eraseFromParent();
  255. if (DT)
  256. DT->changeImmediateDominator(Exit, NewExit);
  257. // Split the main loop exit to maintain canonicalization guarantees.
  258. SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
  259. SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI,
  260. PreserveLCSSA);
  261. }
  262. /// Create a clone of the blocks in a loop and connect them together.
  263. /// If CreateRemainderLoop is false, loop structure will not be cloned,
  264. /// otherwise a new loop will be created including all cloned blocks, and the
  265. /// iterator of it switches to count NewIter down to 0.
  266. /// The cloned blocks should be inserted between InsertTop and InsertBot.
  267. /// If loop structure is cloned InsertTop should be new preheader, InsertBot
  268. /// new loop exit.
  269. /// Return the new cloned loop that is created when CreateRemainderLoop is true.
  270. static Loop *
  271. CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
  272. const bool UseEpilogRemainder, BasicBlock *InsertTop,
  273. BasicBlock *InsertBot, BasicBlock *Preheader,
  274. std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
  275. ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
  276. StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
  277. BasicBlock *Header = L->getHeader();
  278. BasicBlock *Latch = L->getLoopLatch();
  279. Function *F = Header->getParent();
  280. LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
  281. LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
  282. Loop *ParentLoop = L->getParentLoop();
  283. NewLoopsMap NewLoops;
  284. NewLoops[ParentLoop] = ParentLoop;
  285. if (!CreateRemainderLoop)
  286. NewLoops[L] = ParentLoop;
  287. // For each block in the original loop, create a new copy,
  288. // and update the value map with the newly created values.
  289. for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
  290. BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
  291. NewBlocks.push_back(NewBB);
  292. // If we're unrolling the outermost loop, there's no remainder loop,
  293. // and this block isn't in a nested loop, then the new block is not
  294. // in any loop. Otherwise, add it to loopinfo.
  295. if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
  296. addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
  297. VMap[*BB] = NewBB;
  298. if (Header == *BB) {
  299. // For the first block, add a CFG connection to this newly
  300. // created block.
  301. InsertTop->getTerminator()->setSuccessor(0, NewBB);
  302. }
  303. if (DT) {
  304. if (Header == *BB) {
  305. // The header is dominated by the preheader.
  306. DT->addNewBlock(NewBB, InsertTop);
  307. } else {
  308. // Copy information from original loop to unrolled loop.
  309. BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
  310. DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
  311. }
  312. }
  313. if (Latch == *BB) {
  314. // For the last block, if CreateRemainderLoop is false, create a direct
  315. // jump to InsertBot. If not, create a loop back to cloned head.
  316. VMap.erase((*BB)->getTerminator());
  317. BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
  318. BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
  319. IRBuilder<> Builder(LatchBR);
  320. if (!CreateRemainderLoop) {
  321. Builder.CreateBr(InsertBot);
  322. } else {
  323. PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
  324. suffix + ".iter",
  325. FirstLoopBB->getFirstNonPHI());
  326. Value *IdxSub =
  327. Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  328. NewIdx->getName() + ".sub");
  329. Value *IdxCmp =
  330. Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
  331. Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
  332. NewIdx->addIncoming(NewIter, InsertTop);
  333. NewIdx->addIncoming(IdxSub, NewBB);
  334. }
  335. LatchBR->eraseFromParent();
  336. }
  337. }
  338. // Change the incoming values to the ones defined in the preheader or
  339. // cloned loop.
  340. for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
  341. PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
  342. if (!CreateRemainderLoop) {
  343. if (UseEpilogRemainder) {
  344. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  345. NewPHI->setIncomingBlock(idx, InsertTop);
  346. NewPHI->removeIncomingValue(Latch, false);
  347. } else {
  348. VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
  349. cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
  350. }
  351. } else {
  352. unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
  353. NewPHI->setIncomingBlock(idx, InsertTop);
  354. BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
  355. idx = NewPHI->getBasicBlockIndex(Latch);
  356. Value *InVal = NewPHI->getIncomingValue(idx);
  357. NewPHI->setIncomingBlock(idx, NewLatch);
  358. if (Value *V = VMap.lookup(InVal))
  359. NewPHI->setIncomingValue(idx, V);
  360. }
  361. }
  362. if (CreateRemainderLoop) {
  363. Loop *NewLoop = NewLoops[L];
  364. assert(NewLoop && "L should have been cloned");
  365. // Add unroll disable metadata to disable future unrolling for this loop.
  366. SmallVector<Metadata *, 4> MDs;
  367. // Reserve first location for self reference to the LoopID metadata node.
  368. MDs.push_back(nullptr);
  369. MDNode *LoopID = NewLoop->getLoopID();
  370. if (LoopID) {
  371. // First remove any existing loop unrolling metadata.
  372. for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
  373. bool IsUnrollMetadata = false;
  374. MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
  375. if (MD) {
  376. const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
  377. IsUnrollMetadata = S && S->getString().startswith("llvm.loop.unroll.");
  378. }
  379. if (!IsUnrollMetadata)
  380. MDs.push_back(LoopID->getOperand(i));
  381. }
  382. }
  383. LLVMContext &Context = NewLoop->getHeader()->getContext();
  384. SmallVector<Metadata *, 1> DisableOperands;
  385. DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
  386. MDNode *DisableNode = MDNode::get(Context, DisableOperands);
  387. MDs.push_back(DisableNode);
  388. MDNode *NewLoopID = MDNode::get(Context, MDs);
  389. // Set operand 0 to refer to the loop id itself.
  390. NewLoopID->replaceOperandWith(0, NewLoopID);
  391. NewLoop->setLoopID(NewLoopID);
  392. return NewLoop;
  393. }
  394. else
  395. return nullptr;
  396. }
  397. /// Insert code in the prolog/epilog code when unrolling a loop with a
  398. /// run-time trip-count.
  399. ///
  400. /// This method assumes that the loop unroll factor is total number
  401. /// of loop bodies in the loop after unrolling. (Some folks refer
  402. /// to the unroll factor as the number of *extra* copies added).
  403. /// We assume also that the loop unroll factor is a power-of-two. So, after
  404. /// unrolling the loop, the number of loop bodies executed is 2,
  405. /// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
  406. /// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
  407. /// the switch instruction is generated.
  408. ///
  409. /// ***Prolog case***
  410. /// extraiters = tripcount % loopfactor
  411. /// if (extraiters == 0) jump Loop:
  412. /// else jump Prol:
  413. /// Prol: LoopBody;
  414. /// extraiters -= 1 // Omitted if unroll factor is 2.
  415. /// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
  416. /// if (tripcount < loopfactor) jump End:
  417. /// Loop:
  418. /// ...
  419. /// End:
  420. ///
  421. /// ***Epilog case***
  422. /// extraiters = tripcount % loopfactor
  423. /// if (tripcount < loopfactor) jump LoopExit:
  424. /// unroll_iters = tripcount - extraiters
  425. /// Loop: LoopBody; (executes unroll_iter times);
  426. /// unroll_iter -= 1
  427. /// if (unroll_iter != 0) jump Loop:
  428. /// LoopExit:
  429. /// if (extraiters == 0) jump EpilExit:
  430. /// Epil: LoopBody; (executes extraiters times)
  431. /// extraiters -= 1 // Omitted if unroll factor is 2.
  432. /// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
  433. /// EpilExit:
  434. bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
  435. bool AllowExpensiveTripCount,
  436. bool UseEpilogRemainder,
  437. LoopInfo *LI, ScalarEvolution *SE,
  438. DominatorTree *DT, bool PreserveLCSSA) {
  439. // for now, only unroll loops that contain a single exit
  440. if (!UnrollRuntimeMultiExit && !L->getExitingBlock())
  441. return false;
  442. // Make sure the loop is in canonical form.
  443. if (!L->isLoopSimplifyForm())
  444. return false;
  445. // Guaranteed by LoopSimplifyForm.
  446. BasicBlock *Latch = L->getLoopLatch();
  447. BasicBlock *Header = L->getHeader();
  448. BasicBlock *LatchExit = L->getUniqueExitBlock(); // successor out of loop
  449. if (!LatchExit && !UnrollRuntimeMultiExit)
  450. return false;
  451. // These are exit blocks other than the target of the latch exiting block.
  452. SmallVector<BasicBlock *, 4> OtherExits;
  453. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  454. unsigned int ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
  455. // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
  456. // targets of the Latch be an exit block out of the loop. This needs
  457. // to be guaranteed by the callers of UnrollRuntimeLoopRemainder.
  458. assert(!L->contains(LatchBR->getSuccessor(ExitIndex)) &&
  459. "one of the loop latch successors should be the exit block!");
  460. // Support runtime unrolling for multiple exit blocks and multiple exiting
  461. // blocks.
  462. if (!LatchExit) {
  463. LatchExit = LatchBR->getSuccessor(ExitIndex);
  464. // We rely on LCSSA form being preserved when the exit blocks are
  465. // transformed.
  466. if (!PreserveLCSSA)
  467. return false;
  468. SmallVector<BasicBlock *, 4> Exits;
  469. L->getUniqueExitBlocks(Exits);
  470. for (auto *BB : Exits)
  471. if (BB != LatchExit)
  472. OtherExits.push_back(BB);
  473. }
  474. assert(LatchExit && "Latch Exit should exist!");
  475. // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
  476. // UnrollRuntimeMultiExit is true. This will need updating the logic in
  477. // connectEpilog.
  478. if (!LatchExit->getSinglePredecessor())
  479. return false;
  480. // Use Scalar Evolution to compute the trip count. This allows more loops to
  481. // be unrolled than relying on induction var simplification.
  482. if (!SE)
  483. return false;
  484. // Only unroll loops with a computable trip count, and the trip count needs
  485. // to be an int value (allowing a pointer type is a TODO item).
  486. // We calculate the backedge count by using getExitCount on the Latch block,
  487. // which is proven to be the only exiting block in this loop. This is same as
  488. // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
  489. // exiting blocks).
  490. const SCEV *BECountSC = SE->getExitCount(L, Latch);
  491. if (isa<SCEVCouldNotCompute>(BECountSC) ||
  492. !BECountSC->getType()->isIntegerTy())
  493. return false;
  494. unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
  495. // Add 1 since the backedge count doesn't include the first loop iteration.
  496. const SCEV *TripCountSC =
  497. SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
  498. if (isa<SCEVCouldNotCompute>(TripCountSC))
  499. return false;
  500. BasicBlock *PreHeader = L->getLoopPreheader();
  501. BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  502. const DataLayout &DL = Header->getModule()->getDataLayout();
  503. SCEVExpander Expander(*SE, DL, "loop-unroll");
  504. if (!AllowExpensiveTripCount &&
  505. Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR))
  506. return false;
  507. // This constraint lets us deal with an overflowing trip count easily; see the
  508. // comment on ModVal below.
  509. if (Log2_32(Count) > BEWidth)
  510. return false;
  511. // Loop structure is the following:
  512. //
  513. // PreHeader
  514. // Header
  515. // ...
  516. // Latch
  517. // LatchExit
  518. BasicBlock *NewPreHeader;
  519. BasicBlock *NewExit = nullptr;
  520. BasicBlock *PrologExit = nullptr;
  521. BasicBlock *EpilogPreHeader = nullptr;
  522. BasicBlock *PrologPreHeader = nullptr;
  523. if (UseEpilogRemainder) {
  524. // If epilog remainder
  525. // Split PreHeader to insert a branch around loop for unrolling.
  526. NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
  527. NewPreHeader->setName(PreHeader->getName() + ".new");
  528. // Split LatchExit to create phi nodes from branch above.
  529. SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
  530. NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa",
  531. DT, LI, PreserveLCSSA);
  532. // Split NewExit to insert epilog remainder loop.
  533. EpilogPreHeader = SplitBlock(NewExit, NewExit->getTerminator(), DT, LI);
  534. EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
  535. } else {
  536. // If prolog remainder
  537. // Split the original preheader twice to insert prolog remainder loop
  538. PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
  539. PrologPreHeader->setName(Header->getName() + ".prol.preheader");
  540. PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
  541. DT, LI);
  542. PrologExit->setName(Header->getName() + ".prol.loopexit");
  543. // Split PrologExit to get NewPreHeader.
  544. NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
  545. NewPreHeader->setName(PreHeader->getName() + ".new");
  546. }
  547. // Loop structure should be the following:
  548. // Epilog Prolog
  549. //
  550. // PreHeader PreHeader
  551. // *NewPreHeader *PrologPreHeader
  552. // Header *PrologExit
  553. // ... *NewPreHeader
  554. // Latch Header
  555. // *NewExit ...
  556. // *EpilogPreHeader Latch
  557. // LatchExit LatchExit
  558. // Calculate conditions for branch around loop for unrolling
  559. // in epilog case and around prolog remainder loop in prolog case.
  560. // Compute the number of extra iterations required, which is:
  561. // extra iterations = run-time trip count % loop unroll factor
  562. PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
  563. Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
  564. PreHeaderBR);
  565. Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
  566. PreHeaderBR);
  567. IRBuilder<> B(PreHeaderBR);
  568. Value *ModVal;
  569. // Calculate ModVal = (BECount + 1) % Count.
  570. // Note that TripCount is BECount + 1.
  571. if (isPowerOf2_32(Count)) {
  572. // When Count is power of 2 we don't BECount for epilog case, however we'll
  573. // need it for a branch around unrolling loop for prolog case.
  574. ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
  575. // 1. There are no iterations to be run in the prolog/epilog loop.
  576. // OR
  577. // 2. The addition computing TripCount overflowed.
  578. //
  579. // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
  580. // the number of iterations that remain to be run in the original loop is a
  581. // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
  582. // explicitly check this above).
  583. } else {
  584. // As (BECount + 1) can potentially unsigned overflow we count
  585. // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
  586. Value *ModValTmp = B.CreateURem(BECount,
  587. ConstantInt::get(BECount->getType(),
  588. Count));
  589. Value *ModValAdd = B.CreateAdd(ModValTmp,
  590. ConstantInt::get(ModValTmp->getType(), 1));
  591. // At that point (BECount % Count) + 1 could be equal to Count.
  592. // To handle this case we need to take mod by Count one more time.
  593. ModVal = B.CreateURem(ModValAdd,
  594. ConstantInt::get(BECount->getType(), Count),
  595. "xtraiter");
  596. }
  597. Value *BranchVal =
  598. UseEpilogRemainder ? B.CreateICmpULT(BECount,
  599. ConstantInt::get(BECount->getType(),
  600. Count - 1)) :
  601. B.CreateIsNotNull(ModVal, "lcmp.mod");
  602. BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
  603. BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
  604. // Branch to either remainder (extra iterations) loop or unrolling loop.
  605. B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
  606. PreHeaderBR->eraseFromParent();
  607. if (DT) {
  608. if (UseEpilogRemainder)
  609. DT->changeImmediateDominator(NewExit, PreHeader);
  610. else
  611. DT->changeImmediateDominator(PrologExit, PreHeader);
  612. }
  613. Function *F = Header->getParent();
  614. // Get an ordered list of blocks in the loop to help with the ordering of the
  615. // cloned blocks in the prolog/epilog code
  616. LoopBlocksDFS LoopBlocks(L);
  617. LoopBlocks.perform(LI);
  618. //
  619. // For each extra loop iteration, create a copy of the loop's basic blocks
  620. // and generate a condition that branches to the copy depending on the
  621. // number of 'left over' iterations.
  622. //
  623. std::vector<BasicBlock *> NewBlocks;
  624. ValueToValueMapTy VMap;
  625. // For unroll factor 2 remainder loop will have 1 iterations.
  626. // Do not create 1 iteration loop.
  627. bool CreateRemainderLoop = (Count != 2);
  628. // Clone all the basic blocks in the loop. If Count is 2, we don't clone
  629. // the loop, otherwise we create a cloned loop to execute the extra
  630. // iterations. This function adds the appropriate CFG connections.
  631. BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
  632. BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
  633. Loop *remainderLoop = CloneLoopBlocks(
  634. L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
  635. NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
  636. // Insert the cloned blocks into the function.
  637. F->getBasicBlockList().splice(InsertBot->getIterator(),
  638. F->getBasicBlockList(),
  639. NewBlocks[0]->getIterator(),
  640. F->end());
  641. // Now the loop blocks are cloned and the other exiting blocks from the
  642. // remainder are connected to the original Loop's exit blocks. The remaining
  643. // work is to update the phi nodes in the original loop, and take in the
  644. // values from the cloned region. Also update the dominator info for
  645. // OtherExits, since we have new edges into OtherExits.
  646. for (auto *BB : OtherExits) {
  647. for (auto &II : *BB) {
  648. // Given we preserve LCSSA form, we know that the values used outside the
  649. // loop will be used through these phi nodes at the exit blocks that are
  650. // transformed below.
  651. if (!isa<PHINode>(II))
  652. break;
  653. PHINode *Phi = cast<PHINode>(&II);
  654. unsigned oldNumOperands = Phi->getNumIncomingValues();
  655. // Add the incoming values from the remainder code to the end of the phi
  656. // node.
  657. for (unsigned i =0; i < oldNumOperands; i++){
  658. Value *newVal = VMap[Phi->getIncomingValue(i)];
  659. // newVal can be a constant or derived from values outside the loop, and
  660. // hence need not have a VMap value.
  661. if (!newVal)
  662. newVal = Phi->getIncomingValue(i);
  663. Phi->addIncoming(newVal,
  664. cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
  665. }
  666. }
  667. // Update the dominator info because the immediate dominator is no longer the
  668. // header of the original Loop. BB has edges both from L and remainder code.
  669. // Since the preheader determines which loop is run (L or directly jump to
  670. // the remainder code), we set the immediate dominator as the preheader.
  671. if (DT)
  672. DT->changeImmediateDominator(BB, PreHeader);
  673. }
  674. // Loop structure should be the following:
  675. // Epilog Prolog
  676. //
  677. // PreHeader PreHeader
  678. // NewPreHeader PrologPreHeader
  679. // Header PrologHeader
  680. // ... ...
  681. // Latch PrologLatch
  682. // NewExit PrologExit
  683. // EpilogPreHeader NewPreHeader
  684. // EpilogHeader Header
  685. // ... ...
  686. // EpilogLatch Latch
  687. // LatchExit LatchExit
  688. // Rewrite the cloned instruction operands to use the values created when the
  689. // clone is created.
  690. for (BasicBlock *BB : NewBlocks) {
  691. for (Instruction &I : *BB) {
  692. RemapInstruction(&I, VMap,
  693. RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
  694. }
  695. }
  696. if (UseEpilogRemainder) {
  697. // Connect the epilog code to the original loop and update the
  698. // PHI functions.
  699. ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
  700. EpilogPreHeader, NewPreHeader, VMap, DT, LI,
  701. PreserveLCSSA);
  702. // Update counter in loop for unrolling.
  703. // I should be multiply of Count.
  704. IRBuilder<> B2(NewPreHeader->getTerminator());
  705. Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
  706. BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
  707. B2.SetInsertPoint(LatchBR);
  708. PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
  709. Header->getFirstNonPHI());
  710. Value *IdxSub =
  711. B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
  712. NewIdx->getName() + ".nsub");
  713. Value *IdxCmp;
  714. if (LatchBR->getSuccessor(0) == Header)
  715. IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
  716. else
  717. IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
  718. NewIdx->addIncoming(TestVal, NewPreHeader);
  719. NewIdx->addIncoming(IdxSub, Latch);
  720. LatchBR->setCondition(IdxCmp);
  721. } else {
  722. // Connect the prolog code to the original loop and update the
  723. // PHI functions.
  724. ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
  725. NewPreHeader, VMap, DT, LI, PreserveLCSSA);
  726. }
  727. // If this loop is nested, then the loop unroller changes the code in the
  728. // parent loop, so the Scalar Evolution pass needs to be run again.
  729. if (Loop *ParentLoop = L->getParentLoop())
  730. SE->forgetLoop(ParentLoop);
  731. // Canonicalize to LoopSimplifyForm both original and remainder loops. We
  732. // cannot rely on the LoopUnrollPass to do this because it only does
  733. // canonicalization for parent/subloops and not the sibling loops.
  734. if (OtherExits.size() > 0) {
  735. // Generate dedicated exit blocks for the original loop, to preserve
  736. // LoopSimplifyForm.
  737. formDedicatedExitBlocks(L, DT, LI, PreserveLCSSA);
  738. // Generate dedicated exit blocks for the remainder loop if one exists, to
  739. // preserve LoopSimplifyForm.
  740. if (remainderLoop)
  741. formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
  742. }
  743. NumRuntimeUnrolled++;
  744. return true;
  745. }