  1. //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This pass implements a simple loop unroller. It works best when loops have
  11. // been canonicalized by the -indvars pass, allowing it to determine the trip
  12. // counts of loops easily.
  13. //===----------------------------------------------------------------------===//
  14. #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
  15. #include "llvm/ADT/DenseMap.h"
  16. #include "llvm/ADT/DenseMapInfo.h"
  17. #include "llvm/ADT/DenseSet.h"
  18. #include "llvm/ADT/None.h"
  19. #include "llvm/ADT/Optional.h"
  20. #include "llvm/ADT/STLExtras.h"
  21. #include "llvm/ADT/SetVector.h"
  22. #include "llvm/ADT/SmallPtrSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/StringRef.h"
  25. #include "llvm/Analysis/AssumptionCache.h"
  26. #include "llvm/Analysis/CodeMetrics.h"
  27. #include "llvm/Analysis/LoopAnalysisManager.h"
  28. #include "llvm/Analysis/LoopInfo.h"
  29. #include "llvm/Analysis/LoopPass.h"
  30. #include "llvm/Analysis/LoopUnrollAnalyzer.h"
  31. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  32. #include "llvm/Analysis/ProfileSummaryInfo.h"
  33. #include "llvm/Analysis/ScalarEvolution.h"
  34. #include "llvm/Analysis/TargetTransformInfo.h"
  35. #include "llvm/IR/BasicBlock.h"
  36. #include "llvm/IR/CFG.h"
  37. #include "llvm/IR/Constant.h"
  38. #include "llvm/IR/Constants.h"
  39. #include "llvm/IR/DiagnosticInfo.h"
  40. #include "llvm/IR/Dominators.h"
  41. #include "llvm/IR/Function.h"
  42. #include "llvm/IR/Instruction.h"
  43. #include "llvm/IR/Instructions.h"
  44. #include "llvm/IR/IntrinsicInst.h"
  45. #include "llvm/IR/Metadata.h"
  46. #include "llvm/IR/PassManager.h"
  47. #include "llvm/Pass.h"
  48. #include "llvm/Support/Casting.h"
  49. #include "llvm/Support/CommandLine.h"
  50. #include "llvm/Support/Debug.h"
  51. #include "llvm/Support/ErrorHandling.h"
  52. #include "llvm/Support/raw_ostream.h"
  53. #include "llvm/Transforms/Scalar.h"
  54. #include "llvm/Transforms/Scalar/LoopPassManager.h"
  55. #include "llvm/Transforms/Utils/LoopSimplify.h"
  56. #include "llvm/Transforms/Utils/LoopUtils.h"
  57. #include "llvm/Transforms/Utils/UnrollLoop.h"
  58. #include <algorithm>
  59. #include <cassert>
  60. #include <cstdint>
  61. #include <limits>
  62. #include <string>
  63. #include <tuple>
  64. #include <utility>
  65. using namespace llvm;
  66. #define DEBUG_TYPE "loop-unroll"
  67. static cl::opt<unsigned>
  68. UnrollThreshold("unroll-threshold", cl::Hidden,
  69. cl::desc("The cost threshold for loop unrolling"));
  70. static cl::opt<unsigned> UnrollPartialThreshold(
  71. "unroll-partial-threshold", cl::Hidden,
  72. cl::desc("The cost threshold for partial loop unrolling"));
  73. static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
  74. "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
  75. cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
  76. "to the threshold when aggressively unrolling a loop due to the "
  77. "dynamic cost savings. If completely unrolling a loop will reduce "
  78. "the total runtime from X to Y, we boost the loop unroll "
  79. "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
  80. "X/Y). This limit avoids excessive code bloat."));
  81. static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
  82. "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
  83. cl::desc("Don't allow loop unrolling to simulate more than this number of"
  84. "iterations when checking full unroll profitability"));
  85. static cl::opt<unsigned> UnrollCount(
  86. "unroll-count", cl::Hidden,
  87. cl::desc("Use this unroll count for all loops including those with "
  88. "unroll_count pragma values, for testing purposes"));
  89. static cl::opt<unsigned> UnrollMaxCount(
  90. "unroll-max-count", cl::Hidden,
  91. cl::desc("Set the max unroll count for partial and runtime unrolling, for"
  92. "testing purposes"));
  93. static cl::opt<unsigned> UnrollFullMaxCount(
  94. "unroll-full-max-count", cl::Hidden,
  95. cl::desc(
  96. "Set the max unroll count for full unrolling, for testing purposes"));
  97. static cl::opt<unsigned> UnrollPeelCount(
  98. "unroll-peel-count", cl::Hidden,
  99. cl::desc("Set the unroll peeling count, for testing purposes"));
  100. static cl::opt<bool>
  101. UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
  102. cl::desc("Allows loops to be partially unrolled until "
  103. "-unroll-threshold loop size is reached."));
  104. static cl::opt<bool> UnrollAllowRemainder(
  105. "unroll-allow-remainder", cl::Hidden,
  106. cl::desc("Allow generation of a loop remainder (extra iterations) "
  107. "when unrolling a loop."));
  108. static cl::opt<bool>
  109. UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
  110. cl::desc("Unroll loops with run-time trip counts"));
  111. static cl::opt<unsigned> UnrollMaxUpperBound(
  112. "unroll-max-upperbound", cl::init(8), cl::Hidden,
  113. cl::desc(
  114. "The max of trip count upper bound that is considered in unrolling"));
  115. static cl::opt<unsigned> PragmaUnrollThreshold(
  116. "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
  117. cl::desc("Unrolled size limit for loops with an unroll(full) or "
  118. "unroll_count pragma."));
  119. static cl::opt<unsigned> FlatLoopTripCountThreshold(
  120. "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
  121. cl::desc("If the runtime tripcount for the loop is lower than the "
  122. "threshold, the loop is considered as flat and will be less "
  123. "aggressively unrolled."));
  124. static cl::opt<bool>
  125. UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
  126. cl::desc("Allows loops to be peeled when the dynamic "
  127. "trip count is known to be low."));
  128. static cl::opt<bool> UnrollUnrollRemainder(
  129. "unroll-remainder", cl::Hidden,
  130. cl::desc("Allow the loop remainder to be unrolled."));
  131. // This option isn't ever intended to be enabled, it serves to allow
  132. // experiments to check the assumptions about when this kind of revisit is
  133. // necessary.
  134. static cl::opt<bool> UnrollRevisitChildLoops(
  135. "unroll-revisit-child-loops", cl::Hidden,
  136. cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
  137. "This shouldn't typically be needed as child loops (or their "
  138. "clones) were already visited."));
  139. /// A magic value for use with the Threshold parameter to indicate
  140. /// that the loop unroll should be performed regardless of how much
  141. /// code expansion would result.
  142. static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
///
/// Precedence is strictly later-wins, in this order: hard-coded defaults,
/// then target (TTI) overrides, then the opt-for-size adjustment, then
/// cl::opt flags actually passed on the command line, then the explicit
/// User* arguments supplied by the caller.
static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
    Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults. The full-unroll threshold is larger at -O3.
  UP.Threshold = OptLevel > 2 ? 300 : 150;
  UP.MaxPercentThresholdBoost = 400;
  UP.OptSizeThreshold = 0;
  UP.PartialThreshold = 150;
  UP.PartialOptSizeThreshold = 0;
  UP.Count = 0;
  UP.PeelCount = 0;
  UP.DefaultUnrollRuntimeCount = 8;
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
  UP.BEInsns = 2;
  UP.Partial = false;
  UP.Runtime = false;
  UP.AllowRemainder = true;
  UP.UnrollRemainder = false;
  UP.AllowExpensiveTripCount = false;
  UP.Force = false;
  UP.UpperBound = false;
  UP.AllowPeeling = true;

  // Override with any target specific settings.
  TTI.getUnrollingPreferences(L, SE, UP);

  // Apply size attributes: when the enclosing function is optimized for
  // size, fall back to the (much smaller) size-specific thresholds.
  if (L->getHeader()->getParent()->optForSize()) {
    UP.Threshold = UP.OptSizeThreshold;
    UP.PartialThreshold = UP.PartialOptSizeThreshold;
  }

  // Apply any user values specified by cl::opt. getNumOccurrences() > 0
  // distinguishes "flag passed on the command line" from the flag's default.
  if (UnrollThreshold.getNumOccurrences() > 0)
    UP.Threshold = UnrollThreshold;
  if (UnrollPartialThreshold.getNumOccurrences() > 0)
    UP.PartialThreshold = UnrollPartialThreshold;
  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
    UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
  if (UnrollMaxCount.getNumOccurrences() > 0)
    UP.MaxCount = UnrollMaxCount;
  if (UnrollFullMaxCount.getNumOccurrences() > 0)
    UP.FullUnrollMaxCount = UnrollFullMaxCount;
  if (UnrollPeelCount.getNumOccurrences() > 0)
    UP.PeelCount = UnrollPeelCount;
  if (UnrollAllowPartial.getNumOccurrences() > 0)
    UP.Partial = UnrollAllowPartial;
  if (UnrollAllowRemainder.getNumOccurrences() > 0)
    UP.AllowRemainder = UnrollAllowRemainder;
  if (UnrollRuntime.getNumOccurrences() > 0)
    UP.Runtime = UnrollRuntime;
  // NOTE: unlike the flags above, this checks the flag's *value* (default 8),
  // not whether it occurred: -unroll-max-upperbound=0 disables upper-bound
  // unrolling outright.
  if (UnrollMaxUpperBound == 0)
    UP.UpperBound = false;
  if (UnrollAllowPeeling.getNumOccurrences() > 0)
    UP.AllowPeeling = UnrollAllowPeeling;
  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
    UP.UnrollRemainder = UnrollUnrollRemainder;

  // Apply user values provided by argument; these take final precedence.
  if (UserThreshold.hasValue()) {
    UP.Threshold = *UserThreshold;
    UP.PartialThreshold = *UserThreshold;
  }
  if (UserCount.hasValue())
    UP.Count = *UserCount;
  if (UserAllowPartial.hasValue())
    UP.Partial = *UserAllowPartial;
  if (UserRuntime.hasValue())
    UP.Runtime = *UserRuntime;
  if (UserUpperBound.hasValue())
    UP.UpperBound = *UserUpperBound;
  if (UserAllowPeeling.hasValue())
    UP.AllowPeeling = *UserAllowPeeling;

  return UP;
}
namespace {

/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
/// This is designed to work like a tuple of <Instruction *, int> for the
/// purposes of hashing and lookup, but to be able to associate two boolean
/// states with each key.
struct UnrolledInstState {
  Instruction *I;
  int Iteration : 30;      // Simulated iteration this state belongs to.
  unsigned IsFree : 1;     // Instruction simplifies away after unrolling.
  unsigned IsCounted : 1;  // Cost already folded into the running total.
};

/// Hashing and equality testing for a set of the instruction states.
///
/// Only the <I, Iteration> pair participates in key identity; IsFree and
/// IsCounted are payload bits that callers may mutate in place without
/// perturbing the set.
struct UnrolledInstStateKeyInfo {
  using PtrInfo = DenseMapInfo<Instruction *>;
  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;

  // Sentinel keys required by DenseSet; the flag bits are zeroed since they
  // do not take part in hashing or equality.
  static inline UnrolledInstState getEmptyKey() {
    return {PtrInfo::getEmptyKey(), 0, 0, 0};
  }

  static inline UnrolledInstState getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  }

  static inline unsigned getHashValue(const UnrolledInstState &S) {
    return PairInfo::getHashValue({S.I, S.Iteration});
  }

  static inline bool isEqual(const UnrolledInstState &LHS,
                             const UnrolledInstState &RHS) {
    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  }
};

struct EstimatedUnrollCost {
  /// \brief The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// \brief The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};

} // end anonymous namespace
/// \brief Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost>
analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
                      ScalarEvolution &SE, const TargetTransformInfo &TTI,
                      unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze <
             (unsigned)(std::numeric_limits<int>::max() / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown tripcount.
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  unsigned RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop. It walks
  // the def chains of RootI *backwards* across iterations: loop-header PHIs
  // carry cost from iteration N to the value produced in iteration N-1.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          DEBUG(dbgs() << "Adding cost of instruction (iteration " << Iteration
                       << "): ");
          DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      // Seed the next (earlier) iteration with the backedge inputs gathered
      // from the header PHIs above.
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two input: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      // On iteration 0 the PHI takes its preheader input; afterwards it takes
      // the latch (backedge) input, possibly simplified by the previous
      // simulated iteration.
      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        if (isa<DbgInfoIntrinsic>(I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                            (unsigned)IsFree,
                                            /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (isa<CallInst>(&I))
          return None;

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
                       << " UnrolledCost: " << UnrolledCost
                       << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                       << "\n");
          return None;
        }
      }

      TerminatorInst *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
        }
      }
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      DEBUG(dbgs() << " No opportunities found.. exiting.\n"
                   << " UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  // Account for the live-out values flowing through exit-block PHIs; they are
  // observed on the last simulated iteration (LCSSA guarantees such PHIs).
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  DEBUG(dbgs() << "Analysis finished:\n"
               << "UnrolledCost: " << UnrolledCost << ", "
               << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
  527. /// ApproximateLoopSize - Approximate the size of the loop.
  528. static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
  529. bool &NotDuplicatable, bool &Convergent,
  530. const TargetTransformInfo &TTI,
  531. AssumptionCache *AC, unsigned BEInsns) {
  532. SmallPtrSet<const Value *, 32> EphValues;
  533. CodeMetrics::collectEphemeralValues(L, AC, EphValues);
  534. CodeMetrics Metrics;
  535. for (BasicBlock *BB : L->blocks())
  536. Metrics.analyzeBasicBlock(BB, TTI, EphValues);
  537. NumCalls = Metrics.NumInlineCandidates;
  538. NotDuplicatable = Metrics.notDuplicatable;
  539. Convergent = Metrics.convergent;
  540. unsigned LoopSize = Metrics.NumInsts;
  541. // Don't allow an estimate of size zero. This would allows unrolling of loops
  542. // with huge iteration counts, which is a compile time problem even if it's
  543. // not a problem for code quality. Also, the code using this size may assume
  544. // that each loop has at least three instructions (likely a conditional
  545. // branch, a comparison feeding that branch, and some kind of loop increment
  546. // feeding that comparison instruction).
  547. LoopSize = std::max(LoopSize, BEInsns + 1);
  548. return LoopSize;
  549. }
  550. // Returns the loop hint metadata node with the given name (for example,
  551. // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
  552. // returned.
  553. static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
  554. if (MDNode *LoopID = L->getLoopID())
  555. return GetUnrollMetadata(LoopID, Name);
  556. return nullptr;
  557. }
  558. // Returns true if the loop has an unroll(full) pragma.
  559. static bool HasUnrollFullPragma(const Loop *L) {
  560. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
  561. }
  562. // Returns true if the loop has an unroll(enable) pragma. This metadata is used
  563. // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
  564. static bool HasUnrollEnablePragma(const Loop *L) {
  565. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
  566. }
  567. // Returns true if the loop has an unroll(disable) pragma.
  568. static bool HasUnrollDisablePragma(const Loop *L) {
  569. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.disable");
  570. }
  571. // Returns true if the loop has an runtime unroll(disable) pragma.
  572. static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
  573. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
  574. }
  575. // If loop has an unroll_count pragma return the (necessarily
  576. // positive) value from the pragma. Otherwise return 0.
  577. static unsigned UnrollCountPragmaValue(const Loop *L) {
  578. MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
  579. if (MD) {
  580. assert(MD->getNumOperands() == 2 &&
  581. "Unroll count hint metadata should have two operands.");
  582. unsigned Count =
  583. mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
  584. assert(Count >= 1 && "Unroll count must be positive.");
  585. return Count;
  586. }
  587. return 0;
  588. }
  589. // Computes the boosting factor for complete unrolling.
  590. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
  591. // be beneficial to fully unroll the loop even if unrolledcost is large. We
  592. // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
  593. // the unroll threshold.
  594. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
  595. unsigned MaxPercentThresholdBoost) {
  596. if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
  597. return 100;
  598. else if (Cost.UnrolledCost != 0)
  599. // The boosting factor is RolledDynamicCost / UnrolledCost
  600. return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
  601. MaxPercentThresholdBoost);
  602. else
  603. return MaxPercentThresholdBoost;
  604. }
  605. // Returns loop size estimation for unrolled loop.
  606. static uint64_t getUnrolledLoopSize(
  607. unsigned LoopSize,
  608. TargetTransformInfo::UnrollingPreferences &UP) {
  609. assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
  610. return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
  611. }
// Decides the unroll count for \p L and writes it to UP.Count (0 means "do
// not unroll"). Returns true if the count was set explicitly (by the
// -unroll-count option or a pragma).
//
// The decision walks six priorities in order: (1) the -unroll-count option,
// (2) an unroll_count pragma, (3) full unrolling by exact/upper-bound trip
// count, (4) loop peeling, (5) partial unrolling when the trip count is a
// compile-time constant, and (6) runtime unrolling.
//
// \param TripCount     in/out: exact constant trip count, or 0 if unknown;
//                      updated when full unrolling by upper bound is chosen.
// \param MaxTripCount  upper bound on the trip count (only non-zero when
//                      TripCount is zero; see the assert below).
// \param TripMultiple  in/out: known divisor of the trip count; forced to 1
//                      when unrolling by the upper bound, since the actual
//                      exit iteration is then unknown.
// \param UseUpperBound out: set when the upper bound (not the exact count)
//                      is used to fully unroll.
static bool computeUnrollCount(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
    ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount,
    unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize,
    TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
  // Check for explicit Count.
  // 1st priority is unroll count set by "unroll-count" option.
  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
  if (UserUnrollCount) {
    UP.Count = UnrollCount;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    // Only take the user count immediately when a remainder loop is allowed
    // and the unrolled size fits the threshold; otherwise fall through so the
    // later priorities can still pick a workable count.
    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
      return true;
  }
  // 2nd priority is unroll count set by pragma.
  unsigned PragmaCount = UnrollCountPragmaValue(L);
  if (PragmaCount > 0) {
    UP.Count = PragmaCount;
    UP.Runtime = true;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    // Pragmas get the larger PragmaUnrollThreshold size budget.
    if (UP.AllowRemainder &&
        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return true;
  }
  bool PragmaFullUnroll = HasUnrollFullPragma(L);
  if (PragmaFullUnroll && TripCount != 0) {
    UP.Count = TripCount;
    // NOTE: returns false (count not "explicit") even though a pragma drove
    // this; full unrolling removes the loop, so marking it as explicitly
    // unrolled is unnecessary.
    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return false;
  }
  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                        PragmaEnableUnroll || UserUnrollCount;
  if (ExplicitUnroll && TripCount != 0) {
    // If the loop has an unrolling pragma, we want to be more aggressive with
    // unrolling limits. Set thresholds to at least the PragmaThreshold value
    // which is larger than the default limits.
    UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
    UP.PartialThreshold =
        std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
  }
  // 3rd priority is full unroll count.
  // Full unroll makes sense only when TripCount or its upper bound could be
  // statically calculated.
  // Also we need to check if we exceed FullUnrollMaxCount.
  // If using the upper bound to unroll, TripMultiple should be set to 1 because
  // we do not know when loop may exit.
  // MaxTripCount and ExactTripCount cannot both be non zero since we only
  // compute the former when the latter is zero.
  unsigned ExactTripCount = TripCount;
  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
         "ExtractTripCound and MaxTripCount cannot both be non zero.");
  unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
  UP.Count = FullUnrollTripCount;
  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
    // When computing the unrolled size, note that BEInsns are not replicated
    // like the rest of the loop body.
    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
      UseUpperBound = (MaxTripCount == FullUnrollTripCount);
      TripCount = FullUnrollTripCount;
      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
      return ExplicitUnroll;
    } else {
      // The loop isn't that small, but we still can fully unroll it if that
      // helps to remove a significant number of instructions.
      // To check that, run additional analysis on the loop.
      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
              L, FullUnrollTripCount, DT, SE, TTI,
              UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
        unsigned Boost =
            getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
        if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
          UseUpperBound = (MaxTripCount == FullUnrollTripCount);
          TripCount = FullUnrollTripCount;
          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
          return ExplicitUnroll;
        }
      }
    }
  }
  // 4th priority is loop peeling.
  computePeelCount(L, LoopSize, UP, TripCount);
  if (UP.PeelCount) {
    // Peeling replaces unrolling here: keep the loop body single (Count = 1)
    // and disable runtime unrolling.
    UP.Runtime = false;
    UP.Count = 1;
    return ExplicitUnroll;
  }
  // 5th priority is partial unrolling.
  // Try partial unroll only when TripCount could be staticaly calculated.
  if (TripCount) {
    UP.Partial |= ExplicitUnroll;
    if (!UP.Partial) {
      DEBUG(dbgs() << "  will not try to unroll partially because "
                   << "-unroll-allow-partial not given\n");
      UP.Count = 0;
      return false;
    }
    if (UP.Count == 0)
      UP.Count = TripCount;
    if (UP.PartialThreshold != NoThreshold) {
      // Reduce unroll count to be modulo of TripCount for partial unrolling.
      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
        UP.Count =
            (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
            (LoopSize - UP.BEInsns);
      if (UP.Count > UP.MaxCount)
        UP.Count = UP.MaxCount;
      // Walk the count down until it evenly divides the trip count so no
      // remainder iterations are left over.
      while (UP.Count != 0 && TripCount % UP.Count != 0)
        UP.Count--;
      if (UP.AllowRemainder && UP.Count <= 1) {
        // If there is no Count that is modulo of TripCount, set Count to
        // largest power-of-two factor that satisfies the threshold limit.
        // As we'll create fixup loop, do the type of unrolling only if
        // remainder loop is allowed.
        UP.Count = UP.DefaultUnrollRuntimeCount;
        while (UP.Count != 0 &&
               getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
          UP.Count >>= 1;
      }
      if (UP.Count < 2) {
        if (PragmaEnableUnroll)
          ORE->emit([&]() {
            return OptimizationRemarkMissed(DEBUG_TYPE,
                                            "UnrollAsDirectedTooLarge",
                                            L->getStartLoc(), L->getHeader())
                   << "Unable to unroll loop as directed by unroll(enable) "
                      "pragma "
                      "because unrolled size is too large.";
          });
        UP.Count = 0;
      }
    } else {
      UP.Count = TripCount;
    }
    if (UP.Count > UP.MaxCount)
      UP.Count = UP.MaxCount;
    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
        UP.Count != TripCount)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "FullUnrollAsDirectedTooLarge",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to fully unroll loop as directed by unroll pragma "
                  "because "
                  "unrolled size is too large.";
      });
    return ExplicitUnroll;
  }
  assert(TripCount == 0 &&
         "All cases when TripCount is constant should be covered here.");
  if (PragmaFullUnroll)
    ORE->emit([&]() {
      return OptimizationRemarkMissed(
                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
                 L->getStartLoc(), L->getHeader())
             << "Unable to fully unroll loop as directed by unroll(full) "
                "pragma "
                "because loop has a runtime trip count.";
    });
  // 6th priority is runtime unrolling.
  // Don't unroll a runtime trip count loop when it is disabled.
  if (HasRuntimeUnrollDisablePragma(L)) {
    UP.Count = 0;
    return false;
  }
  // Check if the runtime trip count is too small when profile is available.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
      if (*ProfileTripCount < FlatLoopTripCountThreshold)
        return false;
      else
        UP.AllowExpensiveTripCount = true;
    }
  }
  // Reduce count based on the type of unrolling and the threshold values.
  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
  if (!UP.Runtime) {
    DEBUG(dbgs() << "  will not try to unroll loop with runtime trip count "
                 << "-unroll-runtime not given\n");
    UP.Count = 0;
    return false;
  }
  if (UP.Count == 0)
    UP.Count = UP.DefaultUnrollRuntimeCount;
  // Reduce unroll count to be the largest power-of-two factor of
  // the original count which satisfies the threshold limit.
  while (UP.Count != 0 &&
         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
    UP.Count >>= 1;
#ifndef NDEBUG
  unsigned OrigCount = UP.Count;
#endif
  // Without a remainder loop the count must divide the trip multiple; halve
  // it until it does (powers of two preserve the power-of-two shape above).
  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
      UP.Count >>= 1;
    DEBUG(dbgs() << "Remainder loop is restricted (that could architecture "
                    "specific or because the loop contains a convergent "
                    "instruction), so unroll count must divide the trip "
                    "multiple, "
                 << TripMultiple << ".  Reducing unroll count from "
                 << OrigCount << " to " << UP.Count << ".\n");
    using namespace ore;
    if (PragmaCount > 0 && !UP.AllowRemainder)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "DifferentUnrollCountFromDirected",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to unroll loop the number of times directed by "
                  "unroll_count pragma because remainder loop is restricted "
                  "(that could architecture specific or because the loop "
                  "contains a convergent instruction) and so must have an "
                  "unroll "
                  "count that divides the loop trip multiple of "
               << NV("TripMultiple", TripMultiple) << ".  Unrolling instead "
               << NV("UnrollCount", UP.Count) << " time(s).";
      });
  }
  if (UP.Count > UP.MaxCount)
    UP.Count = UP.MaxCount;
  DEBUG(dbgs() << "  partially unrolling with count: " << UP.Count << "\n");
  // A count below 2 means no unrolling would happen; report it as 0.
  if (UP.Count < 2)
    UP.Count = 0;
  return ExplicitUnroll;
}
// Top-level driver shared by the legacy and new pass managers: gathers
// unrolling preferences, sizes the loop, computes trip count information,
// asks computeUnrollCount() for a count, and performs the unroll.
//
// The Provided* optionals carry caller overrides for the corresponding
// unrolling preferences; None means "use the target/default value".
// Returns LoopUnrollResult::Unmodified when nothing was done.
static LoopUnrollResult tryToUnrollLoop(
    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
    const TargetTransformInfo &TTI, AssumptionCache &AC,
    OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
    Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
    Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
    Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
  DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName()
               << "] Loop %" << L->getHeader()->getName() << "\n");
  // An unroll(disable) pragma wins over everything else.
  if (HasUnrollDisablePragma(L))
    return LoopUnrollResult::Unmodified;
  // Unrolling requires a preheader and dedicated exits (loop-simplify form).
  if (!L->isLoopSimplifyForm()) {
    DEBUG(
        dbgs() << "  Not unrolling loop which is not in loop-simplify form.\n");
    return LoopUnrollResult::Unmodified;
  }
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
      ProvidedAllowPeeling);
  // Exit early if unrolling is disabled.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
    return LoopUnrollResult::Unmodified;
  // Size the loop and collect the NotDuplicatable/Convergent/inline-candidate
  // facts that gate the checks below.
  unsigned LoopSize = ApproximateLoopSize(
      L, NumInlineCandidates, NotDuplicatable, Convergent, TTI, &AC, UP.BEInsns);
  DEBUG(dbgs() << "  Loop Size = " << LoopSize << "\n");
  if (NotDuplicatable) {
    DEBUG(dbgs() << "  Not unrolling loop which contains non-duplicatable"
                 << " instructions.\n");
    return LoopUnrollResult::Unmodified;
  }
  // Unrolling would replicate calls that the inliner may still want to
  // inline, multiplying the eventual code growth.
  if (NumInlineCandidates != 0) {
    DEBUG(dbgs() << "  Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }
  // Find trip count and trip multiple if count is not available.
  unsigned TripCount = 0;
  unsigned MaxTripCount = 0;
  unsigned TripMultiple = 1;
  // If there are multiple exiting blocks but one of them is the latch, use the
  // latch for the trip count estimation. Otherwise insist on a single exiting
  // block for the trip count estimation.
  BasicBlock *ExitingBlock = L->getLoopLatch();
  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
    ExitingBlock = L->getExitingBlock();
  if (ExitingBlock) {
    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
    TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
  }
  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation.  Therefore restrict remainder loop (try unrollig without).
  //
  // TODO: This is quite conservative.  In practice, convergent_op()
  // is likely to be called unconditionally in the loop.  In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe.  But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;
  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
    // We can unroll by the upper bound amount if it's generally allowed or if
    // we know that the loop is executed either the upper bound or zero times.
    // (MaxOrZero unrolling keeps only the first loop test, so the number of
    // loop tests remains the same compared to the non-unrolled version, whereas
    // the generic upper bound unrolling keeps all but the last loop test so the
    // number of loop tests goes up which may end up being worse on targets with
    // constriained branch predictor resources so is controlled by an option.)
    // In addition we only unroll small upper bounds.
    if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
      MaxTripCount = 0;
    }
  }
  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly =
      computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount,
                         TripMultiple, LoopSize, UP, UseUpperBound);
  if (!UP.Count)
    return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less or equal to TripCount.
  if (TripCount && UP.Count > TripCount)
    UP.Count = TripCount;
  // Unroll the loop.
  LoopUnrollResult UnrollResult = UnrollLoop(
      L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
      UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA);
  if (UnrollResult == LoopUnrollResult::Unmodified)
    return LoopUnrollResult::Unmodified;
  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  // If the loop was peeled, we already "used up" the profile information
  // we had, so we don't want to unroll or peel again.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
      (IsCountSetExplicitly || UP.PeelCount))
    L->setLoopAlreadyUnrolled();
  return UnrollResult;
}
namespace {

/// Legacy pass manager wrapper around tryToUnrollLoop(). Stores the
/// caller-provided overrides (None means "use the default") and forwards them
/// on each runOnLoop() invocation.
class LoopUnroll : public LoopPass {
public:
  static char ID; // Pass ID, replacement for typeid

  int OptLevel;
  // Overrides for the unrolling preferences; None leaves the corresponding
  // knob at its target/default value.
  Optional<unsigned> ProvidedCount;
  Optional<unsigned> ProvidedThreshold;
  Optional<bool> ProvidedAllowPartial;
  Optional<bool> ProvidedRuntime;
  Optional<bool> ProvidedUpperBound;
  Optional<bool> ProvidedAllowPeeling;

  LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
             Optional<unsigned> Count = None,
             Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
             Optional<bool> UpperBound = None,
             Optional<bool> AllowPeeling = None)
      : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
        ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
        ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
        ProvidedAllowPeeling(AllowPeeling) {
    initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
  }

  bool runOnLoop(Loop *L, LPPassManager &LPM) override {
    if (skipLoop(L))
      return false;
    Function &F = *L->getHeader()->getParent();
    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    const TargetTransformInfo &TTI =
        getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass. Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(&F);
    bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
    LoopUnrollResult Result = tryToUnrollLoop(
        L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount,
        ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
        ProvidedUpperBound, ProvidedAllowPeeling);
    // A fully unrolled loop no longer exists; tell the loop pass manager so
    // it stops scheduling passes over it.
    if (Result == LoopUnrollResult::FullyUnrolled)
      LPM.markLoopAsDeleted(*L);
    return Result != LoopUnrollResult::Unmodified;
  }

  /// This transformation requires natural loop information & requires that
  /// loop preheaders be inserted into the CFG...
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    // FIXME: Loop passes are required to preserve domtree, and for now we just
    // recreate dom info if anything gets unrolled.
    getLoopAnalysisUsage(AU);
  }
};

} // end anonymous namespace
char LoopUnroll::ID = 0;

// Legacy-PM registration for -loop-unroll, declaring the analyses the pass
// queries in runOnLoop() as dependencies.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
  1013. Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
  1014. int AllowPartial, int Runtime, int UpperBound,
  1015. int AllowPeeling) {
  1016. // TODO: It would make more sense for this function to take the optionals
  1017. // directly, but that's dangerous since it would silently break out of tree
  1018. // callers.
  1019. return new LoopUnroll(
  1020. OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
  1021. Count == -1 ? None : Optional<unsigned>(Count),
  1022. AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
  1023. Runtime == -1 ? None : Optional<bool>(Runtime),
  1024. UpperBound == -1 ? None : Optional<bool>(UpperBound),
  1025. AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
  1026. }
// Creates a "simple" unroll pass: default threshold/count, with partial,
// runtime, upper-bound, and peeling unrolling all explicitly disabled (0).
Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
  return createLoopUnrollPass(OptLevel, /*Threshold*/ -1, /*Count*/ -1,
                              /*AllowPartial*/ 0, /*Runtime*/ 0,
                              /*UpperBound*/ 0, /*AllowPeeling*/ 0);
}
// New-PM full-unroll-only pass: runs tryToUnrollLoop() with partial, runtime,
// upper-bound, and peeling unrolling disabled, then tells the loop pass
// manager (via \p Updater) about loops that appeared, moved, or died as a
// result of the transform.
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();
  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
        "cached at a higher level");
  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());
  // Capture the name before unrolling: if the loop is deleted, L is no longer
  // safe to query when reporting the deletion below.
  std::string LoopName = L.getName();
  bool Changed =
      tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
                      /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
                      /*Threshold*/ None, /*AllowPartial*/ false,
                      /*Runtime*/ false, /*UpperBound*/ false,
                      /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();
  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif
  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Filter the sibling list down to loops that are genuinely new: drop L
  // itself (noting it survived) and anything already present before the
  // unroll.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }
    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.count(SibLoop) != 0;
  });
  Updater.addSiblingLoops(SibLoops);
  if (!IsCurrentLoopValid) {
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }
  return getLoopPassPreservedAnalyses();
}
  1107. template <typename RangeT>
  1108. static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
  1109. SmallVector<Loop *, 8> Worklist;
  1110. // We use an internal worklist to build up the preorder traversal without
  1111. // recursion.
  1112. SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
  1113. for (Loop *RootL : Loops) {
  1114. assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
  1115. assert(PreOrderWorklist.empty() &&
  1116. "Must start with an empty preorder walk worklist.");
  1117. PreOrderWorklist.push_back(RootL);
  1118. do {
  1119. Loop *L = PreOrderWorklist.pop_back_val();
  1120. PreOrderWorklist.append(L->begin(), L->end());
  1121. PreOrderLoops.push_back(L);
  1122. } while (!PreOrderWorklist.empty());
  1123. Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
  1124. PreOrderLoops.clear();
  1125. }
  1126. return Worklist;
  1127. }
// New-PM function-level unroll pass: simplifies every loop up front, then
// visits all loops in the function (worklist in reverse, i.e. forward across
// the CFG) and attempts to unroll each one with the default knobs.
PreservedAnalyses LoopUnrollPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  // The loop analysis manager is optional here; we only need it to clear
  // cached results for loops we fully unroll (delete) below.
  LoopAnalysisManager *LAM = nullptr;
  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
    LAM = &LAMProxy->getManager();
  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  // PSI may be null when no profile summary is cached; peeling is only
  // restricted when it is available (see below).
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  bool Changed = false;
  // The unroller requires loops to be in simplified form, and also needs LCSSA.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop unroller
  // will simplify all loops, regardless of whether anything end up being
  // unrolled.
  for (auto &L : LI) {
    Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }
  SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);
  while (!Worklist.empty()) {
    // Because the LoopInfo stores the loops in RPO, we walk the worklist
    // from back to front so that we work forward across the CFG, which
    // for unrolling is only needed to get optimization remarks emitted in
    // a forward order.
    Loop &L = *Worklist.pop_back_val();
#ifndef NDEBUG
    Loop *ParentL = L.getParentLoop();
#endif
    // The API here is quite complex to call, but there are only two interesting
    // states we support: partial and full (or "simple") unrolling. However, to
    // enable these things we actually pass "None" in for the optional to avoid
    // providing an explicit choice.
    Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
        AllowPeeling;
    // Check if the profile summary indicates that the profiled application
    // has a huge working set size, in which case we disable peeling to avoid
    // bloating it further.
    if (PSI && PSI->hasHugeWorkingSetSize())
      AllowPeeling = false;
    // Capture the name before unrolling: if the loop is fully unrolled, L is
    // no longer valid when clearing its cached analyses below.
    std::string LoopName = L.getName();
    LoopUnrollResult Result =
        tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
                        /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
                        /*Threshold*/ None, AllowPartialParam, RuntimeParam,
                        UpperBoundParam, AllowPeeling);
    Changed |= Result != LoopUnrollResult::Unmodified;
    // The parent must not be damaged by unrolling!
#ifndef NDEBUG
    if (Result != LoopUnrollResult::Unmodified && ParentL)
      ParentL->verifyLoop();
#endif
    // Clear any cached analysis results for L if we removed it completely.
    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
      LAM->clear(L, LoopName);
  }
  if (!Changed)
    return PreservedAnalyses::all();
  return getLoopPassPreservedAnalyses();
}