  1. //===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This pass implements a simple loop unroller. It works best when loops have
  11. // been canonicalized by the -indvars pass, allowing it to determine the trip
  12. // counts of loops easily.
  13. //===----------------------------------------------------------------------===//
  14. #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
  15. #include "llvm/ADT/DenseMap.h"
  16. #include "llvm/ADT/DenseMapInfo.h"
  17. #include "llvm/ADT/DenseSet.h"
  18. #include "llvm/ADT/None.h"
  19. #include "llvm/ADT/Optional.h"
  20. #include "llvm/ADT/STLExtras.h"
  21. #include "llvm/ADT/SetVector.h"
  22. #include "llvm/ADT/SmallPtrSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/StringRef.h"
  25. #include "llvm/Analysis/AssumptionCache.h"
  26. #include "llvm/Analysis/CodeMetrics.h"
  27. #include "llvm/Analysis/LoopAnalysisManager.h"
  28. #include "llvm/Analysis/LoopInfo.h"
  29. #include "llvm/Analysis/LoopPass.h"
  30. #include "llvm/Analysis/LoopUnrollAnalyzer.h"
  31. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  32. #include "llvm/Analysis/ProfileSummaryInfo.h"
  33. #include "llvm/Analysis/ScalarEvolution.h"
  34. #include "llvm/Analysis/TargetTransformInfo.h"
  35. #include "llvm/IR/BasicBlock.h"
  36. #include "llvm/IR/CFG.h"
  37. #include "llvm/IR/Constant.h"
  38. #include "llvm/IR/Constants.h"
  39. #include "llvm/IR/DiagnosticInfo.h"
  40. #include "llvm/IR/Dominators.h"
  41. #include "llvm/IR/Function.h"
  42. #include "llvm/IR/Instruction.h"
  43. #include "llvm/IR/Instructions.h"
  44. #include "llvm/IR/IntrinsicInst.h"
  45. #include "llvm/IR/Metadata.h"
  46. #include "llvm/IR/PassManager.h"
  47. #include "llvm/Pass.h"
  48. #include "llvm/Support/Casting.h"
  49. #include "llvm/Support/CommandLine.h"
  50. #include "llvm/Support/Debug.h"
  51. #include "llvm/Support/ErrorHandling.h"
  52. #include "llvm/Support/raw_ostream.h"
  53. #include "llvm/Transforms/Scalar.h"
  54. #include "llvm/Transforms/Scalar/LoopPassManager.h"
  55. #include "llvm/Transforms/Utils.h"
  56. #include "llvm/Transforms/Utils/LoopSimplify.h"
  57. #include "llvm/Transforms/Utils/LoopUtils.h"
  58. #include "llvm/Transforms/Utils/UnrollLoop.h"
  59. #include <algorithm>
  60. #include <cassert>
  61. #include <cstdint>
  62. #include <limits>
  63. #include <string>
  64. #include <tuple>
  65. #include <utility>
  66. using namespace llvm;
  67. #define DEBUG_TYPE "loop-unroll"
  68. static cl::opt<unsigned>
  69. UnrollThreshold("unroll-threshold", cl::Hidden,
  70. cl::desc("The cost threshold for loop unrolling"));
  71. static cl::opt<unsigned> UnrollPartialThreshold(
  72. "unroll-partial-threshold", cl::Hidden,
  73. cl::desc("The cost threshold for partial loop unrolling"));
  74. static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
  75. "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
  76. cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
  77. "to the threshold when aggressively unrolling a loop due to the "
  78. "dynamic cost savings. If completely unrolling a loop will reduce "
  79. "the total runtime from X to Y, we boost the loop unroll "
  80. "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
  81. "X/Y). This limit avoids excessive code bloat."));
  82. static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
  83. "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
  84. cl::desc("Don't allow loop unrolling to simulate more than this number of"
  85. "iterations when checking full unroll profitability"));
  86. static cl::opt<unsigned> UnrollCount(
  87. "unroll-count", cl::Hidden,
  88. cl::desc("Use this unroll count for all loops including those with "
  89. "unroll_count pragma values, for testing purposes"));
  90. static cl::opt<unsigned> UnrollMaxCount(
  91. "unroll-max-count", cl::Hidden,
  92. cl::desc("Set the max unroll count for partial and runtime unrolling, for"
  93. "testing purposes"));
  94. static cl::opt<unsigned> UnrollFullMaxCount(
  95. "unroll-full-max-count", cl::Hidden,
  96. cl::desc(
  97. "Set the max unroll count for full unrolling, for testing purposes"));
  98. static cl::opt<unsigned> UnrollPeelCount(
  99. "unroll-peel-count", cl::Hidden,
  100. cl::desc("Set the unroll peeling count, for testing purposes"));
  101. static cl::opt<bool>
  102. UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
  103. cl::desc("Allows loops to be partially unrolled until "
  104. "-unroll-threshold loop size is reached."));
  105. static cl::opt<bool> UnrollAllowRemainder(
  106. "unroll-allow-remainder", cl::Hidden,
  107. cl::desc("Allow generation of a loop remainder (extra iterations) "
  108. "when unrolling a loop."));
  109. static cl::opt<bool>
  110. UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
  111. cl::desc("Unroll loops with run-time trip counts"));
  112. static cl::opt<unsigned> UnrollMaxUpperBound(
  113. "unroll-max-upperbound", cl::init(8), cl::Hidden,
  114. cl::desc(
  115. "The max of trip count upper bound that is considered in unrolling"));
  116. static cl::opt<unsigned> PragmaUnrollThreshold(
  117. "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
  118. cl::desc("Unrolled size limit for loops with an unroll(full) or "
  119. "unroll_count pragma."));
  120. static cl::opt<unsigned> FlatLoopTripCountThreshold(
  121. "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
  122. cl::desc("If the runtime tripcount for the loop is lower than the "
  123. "threshold, the loop is considered as flat and will be less "
  124. "aggressively unrolled."));
  125. static cl::opt<bool>
  126. UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
  127. cl::desc("Allows loops to be peeled when the dynamic "
  128. "trip count is known to be low."));
  129. static cl::opt<bool> UnrollUnrollRemainder(
  130. "unroll-remainder", cl::Hidden,
  131. cl::desc("Allow the loop remainder to be unrolled."));
  132. // This option isn't ever intended to be enabled, it serves to allow
  133. // experiments to check the assumptions about when this kind of revisit is
  134. // necessary.
  135. static cl::opt<bool> UnrollRevisitChildLoops(
  136. "unroll-revisit-child-loops", cl::Hidden,
  137. cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
  138. "This shouldn't typically be needed as child loops (or their "
  139. "clones) were already visited."));
  140. /// A magic value for use with the Threshold parameter to indicate
  141. /// that the loop unroll should be performed regardless of how much
  142. /// code expansion would result.
  143. static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
/// Gather the various unrolling parameters based on the defaults, compiler
/// flags, TTI overrides and user specified parameters.
///
/// Precedence, weakest to strongest: built-in defaults, target overrides
/// (TTI), function size attributes, cl::opt command-line flags, then the
/// explicit User* arguments. The statement order below implements that
/// layering, so do not reorder the override stages.
TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
    Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
    Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling) {
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults.
  UP.Threshold = OptLevel > 2 ? 300 : 150; // More aggressive above -O2.
  UP.MaxPercentThresholdBoost = 400;
  UP.OptSizeThreshold = 0;
  UP.PartialThreshold = 150;
  UP.PartialOptSizeThreshold = 0;
  UP.Count = 0;
  UP.PeelCount = 0;
  UP.DefaultUnrollRuntimeCount = 8;
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
  UP.BEInsns = 2;
  UP.Partial = false;
  UP.Runtime = false;
  UP.AllowRemainder = true;
  UP.UnrollRemainder = false;
  UP.AllowExpensiveTripCount = false;
  UP.Force = false;
  UP.UpperBound = false;
  UP.AllowPeeling = true;
  UP.UnrollAndJam = false;
  UP.UnrollAndJamInnerLoopThreshold = 60;

  // Override with any target specific settings.
  TTI.getUnrollingPreferences(L, SE, UP);

  // Apply size attributes: when the enclosing function is optimized for size,
  // fall back to the (typically much smaller) size-oriented thresholds.
  if (L->getHeader()->getParent()->optForSize()) {
    UP.Threshold = UP.OptSizeThreshold;
    UP.PartialThreshold = UP.PartialOptSizeThreshold;
  }

  // Apply any user values specified by cl::opt. getNumOccurrences() > 0
  // means the flag actually appeared on the command line; defaults alone do
  // not override the values computed so far.
  if (UnrollThreshold.getNumOccurrences() > 0)
    UP.Threshold = UnrollThreshold;
  if (UnrollPartialThreshold.getNumOccurrences() > 0)
    UP.PartialThreshold = UnrollPartialThreshold;
  if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
    UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
  if (UnrollMaxCount.getNumOccurrences() > 0)
    UP.MaxCount = UnrollMaxCount;
  if (UnrollFullMaxCount.getNumOccurrences() > 0)
    UP.FullUnrollMaxCount = UnrollFullMaxCount;
  if (UnrollPeelCount.getNumOccurrences() > 0)
    UP.PeelCount = UnrollPeelCount;
  if (UnrollAllowPartial.getNumOccurrences() > 0)
    UP.Partial = UnrollAllowPartial;
  if (UnrollAllowRemainder.getNumOccurrences() > 0)
    UP.AllowRemainder = UnrollAllowRemainder;
  if (UnrollRuntime.getNumOccurrences() > 0)
    UP.Runtime = UnrollRuntime;
  // A zero -unroll-max-upperbound disables upper-bound unrolling entirely.
  if (UnrollMaxUpperBound == 0)
    UP.UpperBound = false;
  if (UnrollAllowPeeling.getNumOccurrences() > 0)
    UP.AllowPeeling = UnrollAllowPeeling;
  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
    UP.UnrollRemainder = UnrollUnrollRemainder;

  // Apply user values provided by argument; these take precedence over
  // everything above. Note that a user Threshold also resets PartialThreshold.
  if (UserThreshold.hasValue()) {
    UP.Threshold = *UserThreshold;
    UP.PartialThreshold = *UserThreshold;
  }
  if (UserCount.hasValue())
    UP.Count = *UserCount;
  if (UserAllowPartial.hasValue())
    UP.Partial = *UserAllowPartial;
  if (UserRuntime.hasValue())
    UP.Runtime = *UserRuntime;
  if (UserUpperBound.hasValue())
    UP.UpperBound = *UserUpperBound;
  if (UserAllowPeeling.hasValue())
    UP.AllowPeeling = *UserAllowPeeling;

  return UP;
}
namespace {

/// A struct to densely store the state of an instruction after unrolling at
/// each iteration.
///
/// This is designed to work like a tuple of <Instruction *, int> for the
/// purposes of hashing and lookup, but to be able to associate two boolean
/// states with each key.
struct UnrolledInstState {
  Instruction *I;
  int Iteration : 30;     // Simulated iteration this state belongs to.
  unsigned IsFree : 1;    // Instruction is expected to simplify away.
  unsigned IsCounted : 1; // Cost already folded into the unrolled total.
};

/// Hashing and equality testing for a set of the instruction states.
///
/// Only the <I, Iteration> pair participates in hashing and equality; the
/// IsFree/IsCounted bits are payload that may be mutated while the entry
/// lives in the set.
struct UnrolledInstStateKeyInfo {
  using PtrInfo = DenseMapInfo<Instruction *>;
  using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;

  static inline UnrolledInstState getEmptyKey() {
    return {PtrInfo::getEmptyKey(), 0, 0, 0};
  }

  static inline UnrolledInstState getTombstoneKey() {
    return {PtrInfo::getTombstoneKey(), 0, 0, 0};
  }

  static inline unsigned getHashValue(const UnrolledInstState &S) {
    return PairInfo::getHashValue({S.I, S.Iteration});
  }

  static inline bool isEqual(const UnrolledInstState &LHS,
                             const UnrolledInstState &RHS) {
    return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
  }
};

/// Result pair produced by analyzeLoopUnrollCost().
struct EstimatedUnrollCost {
  /// The estimated cost after unrolling.
  unsigned UnrolledCost;

  /// The estimated dynamic cost of executing the instructions in the
  /// rolled form.
  unsigned RolledDynamicCost;
};

} // end anonymous namespace
/// Figure out if the loop is worth full unrolling.
///
/// Complete loop unrolling can make some loads constant, and we need to know
/// if that would expose any further optimization opportunities. This routine
/// estimates this optimization. It computes cost of unrolled loop
/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
/// dynamic cost we mean that we won't count costs of blocks that are known not
/// to be executed (i.e. if we have a branch in the loop and we know that at the
/// given iteration its condition would be resolved to true, we won't add up the
/// cost of the 'false'-block).
/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
/// the analysis failed (no benefits expected from the unrolling, or the loop is
/// too big to analyze), the returned value is None.
static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
    const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
    const SmallPtrSetImpl<const Value *> &EphValues,
    const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
  // We want to be able to scale offsets by the trip count and add more offsets
  // to them without checking for overflows, and we already don't want to
  // analyze *massive* trip counts, so we force the max to be reasonably small.
  assert(UnrollMaxIterationsCountToAnalyze <
             (unsigned)(std::numeric_limits<int>::max() / 2) &&
         "The unroll iterations max is too large!");

  // Only analyze inner loops. We can't properly estimate cost of nested loops
  // and we won't visit inner loops again anyway.
  if (!L->empty())
    return None;

  // Don't simulate loops with a big or unknown tripcount
  if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
      TripCount > UnrollMaxIterationsCountToAnalyze)
    return None;

  SmallSetVector<BasicBlock *, 16> BBWorklist;
  // Pairs of (exiting block inside the loop, exit block outside it) whose
  // exit PHIs must be costed after the main simulation finishes.
  SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
  DenseMap<Value *, Constant *> SimplifiedValues;
  SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;

  // The estimated cost of the unrolled form of the loop. We try to estimate
  // this by simplifying as much as we can while computing the estimate.
  unsigned UnrolledCost = 0;

  // We also track the estimated dynamic (that is, actually executed) cost in
  // the rolled form. This helps identify cases when the savings from unrolling
  // aren't just exposing dead control flows, but actual reduced dynamic
  // instructions due to the simplifications which we expect to occur after
  // unrolling.
  unsigned RolledDynamicCost = 0;

  // We track the simplification of each instruction in each iteration. We use
  // this to recursively merge costs into the unrolled cost on-demand so that
  // we don't count the cost of any dead code. This is essentially a map from
  // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
  DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;

  // A small worklist used to accumulate cost of instructions from each
  // observable and reached root in the loop.
  SmallVector<Instruction *, 16> CostWorklist;

  // PHI-used worklist used between iterations while accumulating cost.
  SmallVector<Instruction *, 4> PHIUsedList;

  // Helper function to accumulate cost for instructions in the loop. Walks
  // the use-def chain of RootI backwards (and backwards through iterations
  // via loop-header PHIs), adding the cost of every not-yet-counted,
  // non-free instruction it reaches.
  auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
    assert(Iteration >= 0 && "Cannot have a negative iteration!");
    assert(CostWorklist.empty() && "Must start with an empty cost list");
    assert(PHIUsedList.empty() && "Must start with an empty phi used list");
    CostWorklist.push_back(&RootI);
    // Outer loop steps Iteration backwards each time the PHI-used list
    // carries work into the previous iteration.
    for (;; --Iteration) {
      do {
        Instruction *I = CostWorklist.pop_back_val();

        // InstCostMap only uses I and Iteration as a key, the other two values
        // don't matter here.
        auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
        if (CostIter == InstCostMap.end())
          // If an input to a PHI node comes from a dead path through the loop
          // we may have no cost data for it here. What that actually means is
          // that it is free.
          continue;
        auto &Cost = *CostIter;
        if (Cost.IsCounted)
          // Already counted this instruction.
          continue;

        // Mark that we are counting the cost of this instruction now.
        Cost.IsCounted = true;

        // If this is a PHI node in the loop header, just add it to the PHI set.
        if (auto *PhiI = dyn_cast<PHINode>(I))
          if (PhiI->getParent() == L->getHeader()) {
            assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
                                  "inherently simplify during unrolling.");
            if (Iteration == 0)
              continue;

            // Push the incoming value from the backedge into the PHI used list
            // if it is an in-loop instruction. We'll use this to populate the
            // cost worklist for the next iteration (as we count backwards).
            if (auto *OpI = dyn_cast<Instruction>(
                    PhiI->getIncomingValueForBlock(L->getLoopLatch())))
              if (L->contains(OpI))
                PHIUsedList.push_back(OpI);
            continue;
          }

        // First accumulate the cost of this instruction.
        if (!Cost.IsFree) {
          UnrolledCost += TTI.getUserCost(I);
          LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                            << Iteration << "): ");
          LLVM_DEBUG(I->dump());
        }

        // We must count the cost of every operand which is not free,
        // recursively. If we reach a loop PHI node, simply add it to the set
        // to be considered on the next iteration (backwards!).
        for (Value *Op : I->operands()) {
          // Check whether this operand is free due to being a constant or
          // outside the loop.
          auto *OpI = dyn_cast<Instruction>(Op);
          if (!OpI || !L->contains(OpI))
            continue;

          // Otherwise accumulate its cost.
          CostWorklist.push_back(OpI);
        }
      } while (!CostWorklist.empty());

      if (PHIUsedList.empty())
        // We've exhausted the search.
        break;

      assert(Iteration > 0 &&
             "Cannot track PHI-used values past the first iteration!");
      CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
      PHIUsedList.clear();
    }
  };

  // Ensure that we don't violate the loop structure invariants relied on by
  // this analysis.
  assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
  assert(L->isLCSSAForm(DT) &&
         "Must have loops in LCSSA form to track live-out values.");

  LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");

  // Simulate execution of each iteration of the loop counting instructions,
  // which would be simplified.
  // Since the same load will take different values on different iterations,
  // we literally have to go through all loop's iterations.
  for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
    LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");

    // Prepare for the iteration by collecting any simplified entry or backedge
    // inputs.
    for (Instruction &I : *L->getHeader()) {
      auto *PHI = dyn_cast<PHINode>(&I);
      if (!PHI)
        break;

      // The loop header PHI nodes must have exactly two input: one from the
      // loop preheader and one from the loop latch.
      assert(
          PHI->getNumIncomingValues() == 2 &&
          "Must have an incoming value only for the preheader and the latch.");

      // Iteration 0 takes the preheader value; later iterations take the
      // latch value, possibly already simplified on the previous iteration.
      Value *V = PHI->getIncomingValueForBlock(
          Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
      Constant *C = dyn_cast<Constant>(V);
      if (Iteration != 0 && !C)
        C = SimplifiedValues.lookup(V);
      if (C)
        SimplifiedInputValues.push_back({PHI, C});
    }

    // Now clear and re-populate the map for the next iteration.
    SimplifiedValues.clear();
    while (!SimplifiedInputValues.empty())
      SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());

    UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);

    BBWorklist.clear();
    BBWorklist.insert(L->getHeader());
    // Note that we *must not* cache the size, this loop grows the worklist.
    for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
      BasicBlock *BB = BBWorklist[Idx];

      // Visit all instructions in the given basic block and try to simplify
      // it. We don't change the actual IR, just count optimization
      // opportunities.
      for (Instruction &I : *BB) {
        // These won't get into the final code - don't even try calculating the
        // cost for them.
        if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
          continue;

        // Track this instruction's expected baseline cost when executing the
        // rolled loop form.
        RolledDynamicCost += TTI.getUserCost(&I);

        // Visit the instruction to analyze its loop cost after unrolling,
        // and if the visitor returns true, mark the instruction as free after
        // unrolling and continue.
        bool IsFree = Analyzer.visit(I);
        bool Inserted = InstCostMap.insert({&I, (int)Iteration,
                                            (unsigned)IsFree,
                                            /*IsCounted*/ false}).second;
        (void)Inserted;
        assert(Inserted && "Cannot have a state for an unvisited instruction!");

        if (IsFree)
          continue;

        // Can't properly model a cost of a call.
        // FIXME: With a proper cost model we should be able to do it.
        if (auto *CI = dyn_cast<CallInst>(&I)) {
          const Function *Callee = CI->getCalledFunction();
          if (!Callee || TTI.isLoweredToCall(Callee)) {
            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
            return None;
          }
        }

        // If the instruction might have a side-effect recursively account for
        // the cost of it and all the instructions leading up to it.
        if (I.mayHaveSideEffects())
          AddCostRecursively(I, Iteration);

        // If unrolled body turns out to be too big, bail out.
        if (UnrolledCost > MaxUnrolledLoopSize) {
          LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
                            << " UnrolledCost: " << UnrolledCost
                            << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
                            << "\n");
          return None;
        }
      }

      Instruction *TI = BB->getTerminator();

      // Add in the live successors by first checking whether we have terminator
      // that may be simplified based on the values simplified by this call.
      BasicBlock *KnownSucc = nullptr;
      if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
        if (BI->isConditional()) {
          if (Constant *SimpleCond =
                  SimplifiedValues.lookup(BI->getCondition())) {
            // Just take the first successor if condition is undef
            if (isa<UndefValue>(SimpleCond))
              KnownSucc = BI->getSuccessor(0);
            else if (ConstantInt *SimpleCondVal =
                         dyn_cast<ConstantInt>(SimpleCond))
              KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
          }
        }
      } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
        if (Constant *SimpleCond =
                SimplifiedValues.lookup(SI->getCondition())) {
          // Just take the first successor if condition is undef
          if (isa<UndefValue>(SimpleCond))
            KnownSucc = SI->getSuccessor(0);
          else if (ConstantInt *SimpleCondVal =
                       dyn_cast<ConstantInt>(SimpleCond))
            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
        }
      }
      // A resolvable terminator contributes only its one live successor;
      // everything else (the dead paths) is never enqueued.
      if (KnownSucc) {
        if (L->contains(KnownSucc))
          BBWorklist.insert(KnownSucc);
        else
          ExitWorklist.insert({BB, KnownSucc});
        continue;
      }

      // Add BB's successors to the worklist.
      for (BasicBlock *Succ : successors(BB))
        if (L->contains(Succ))
          BBWorklist.insert(Succ);
        else
          ExitWorklist.insert({BB, Succ});
      AddCostRecursively(*TI, Iteration);
    }

    // If we found no optimization opportunities on the first iteration, we
    // won't find them on later ones too.
    if (UnrolledCost == RolledDynamicCost) {
      LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
                        << " UnrolledCost: " << UnrolledCost << "\n");
      return None;
    }
  }

  // Account for live-out values: any in-loop instruction feeding an exit PHI
  // is observable, so its cost (as of the last iteration) must be counted.
  while (!ExitWorklist.empty()) {
    BasicBlock *ExitingBB, *ExitBB;
    std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();

    for (Instruction &I : *ExitBB) {
      auto *PN = dyn_cast<PHINode>(&I);
      if (!PN)
        break;

      Value *Op = PN->getIncomingValueForBlock(ExitingBB);
      if (auto *OpI = dyn_cast<Instruction>(Op))
        if (L->contains(OpI))
          AddCostRecursively(*OpI, TripCount - 1);
    }
  }

  LLVM_DEBUG(dbgs() << "Analysis finished:\n"
                    << "UnrolledCost: " << UnrolledCost << ", "
                    << "RolledDynamicCost: " << RolledDynamicCost << "\n");
  return {{UnrolledCost, RolledDynamicCost}};
}
  537. /// ApproximateLoopSize - Approximate the size of the loop.
  538. unsigned llvm::ApproximateLoopSize(
  539. const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
  540. const TargetTransformInfo &TTI,
  541. const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
  542. CodeMetrics Metrics;
  543. for (BasicBlock *BB : L->blocks())
  544. Metrics.analyzeBasicBlock(BB, TTI, EphValues);
  545. NumCalls = Metrics.NumInlineCandidates;
  546. NotDuplicatable = Metrics.notDuplicatable;
  547. Convergent = Metrics.convergent;
  548. unsigned LoopSize = Metrics.NumInsts;
  549. // Don't allow an estimate of size zero. This would allows unrolling of loops
  550. // with huge iteration counts, which is a compile time problem even if it's
  551. // not a problem for code quality. Also, the code using this size may assume
  552. // that each loop has at least three instructions (likely a conditional
  553. // branch, a comparison feeding that branch, and some kind of loop increment
  554. // feeding that comparison instruction).
  555. LoopSize = std::max(LoopSize, BEInsns + 1);
  556. return LoopSize;
  557. }
  558. // Returns the loop hint metadata node with the given name (for example,
  559. // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
  560. // returned.
  561. static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
  562. if (MDNode *LoopID = L->getLoopID())
  563. return GetUnrollMetadata(LoopID, Name);
  564. return nullptr;
  565. }
  566. // Returns true if the loop has an unroll(full) pragma.
  567. static bool HasUnrollFullPragma(const Loop *L) {
  568. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
  569. }
  570. // Returns true if the loop has an unroll(enable) pragma. This metadata is used
  571. // for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
  572. static bool HasUnrollEnablePragma(const Loop *L) {
  573. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
  574. }
  575. // Returns true if the loop has an runtime unroll(disable) pragma.
  576. static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
  577. return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
  578. }
  579. // If loop has an unroll_count pragma return the (necessarily
  580. // positive) value from the pragma. Otherwise return 0.
  581. static unsigned UnrollCountPragmaValue(const Loop *L) {
  582. MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
  583. if (MD) {
  584. assert(MD->getNumOperands() == 2 &&
  585. "Unroll count hint metadata should have two operands.");
  586. unsigned Count =
  587. mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
  588. assert(Count >= 1 && "Unroll count must be positive.");
  589. return Count;
  590. }
  591. return 0;
  592. }
  593. // Computes the boosting factor for complete unrolling.
  594. // If fully unrolling the loop would save a lot of RolledDynamicCost, it would
  595. // be beneficial to fully unroll the loop even if unrolledcost is large. We
  596. // use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
  597. // the unroll threshold.
  598. static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
  599. unsigned MaxPercentThresholdBoost) {
  600. if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
  601. return 100;
  602. else if (Cost.UnrolledCost != 0)
  603. // The boosting factor is RolledDynamicCost / UnrolledCost
  604. return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
  605. MaxPercentThresholdBoost);
  606. else
  607. return MaxPercentThresholdBoost;
  608. }
  609. // Returns loop size estimation for unrolled loop.
  610. static uint64_t getUnrolledLoopSize(
  611. unsigned LoopSize,
  612. TargetTransformInfo::UnrollingPreferences &UP) {
  613. assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
  614. return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
  615. }
// Returns true if unroll count was set explicitly.
// Calculates unroll count and writes it to UP.Count.
// Unless IgnoreUser is true, will also use metadata and command-line options
// that are specific to the LoopUnroll pass (which, for instance, are
// irrelevant for the LoopUnrollAndJam pass).
// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
// many LoopUnroll-specific options. The shared functionality should be
// refactored into its own function.
//
// The six priority tiers below are tried in order; each tier either commits a
// count (by returning) or falls through to the next, possibly leaving a
// tentative value in UP.Count for the later tiers to refine.
bool llvm::computeUnrollCount(
    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
    OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
    unsigned &TripMultiple, unsigned LoopSize,
    TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
  // Check for explicit Count.
  // 1st priority is unroll count set by "unroll-count" option.
  bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
  if (UserUnrollCount) {
    UP.Count = UnrollCount;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
      return true;
  }
  // 2nd priority is unroll count set by pragma.
  unsigned PragmaCount = UnrollCountPragmaValue(L);
  if (PragmaCount > 0) {
    UP.Count = PragmaCount;
    UP.Runtime = true;
    UP.AllowExpensiveTripCount = true;
    UP.Force = true;
    // Honor the pragma count only if either a remainder loop is allowed or
    // the count divides the known trip multiple exactly (no remainder needed).
    if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
        getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return true;
  }
  bool PragmaFullUnroll = HasUnrollFullPragma(L);
  if (PragmaFullUnroll && TripCount != 0) {
    UP.Count = TripCount;
    // Returning false: the count is set by the trip count, not explicitly.
    if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
      return false;
  }
  bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
  bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
                        PragmaEnableUnroll || UserUnrollCount;
  if (ExplicitUnroll && TripCount != 0) {
    // If the loop has an unrolling pragma, we want to be more aggressive with
    // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
    // value which is larger than the default limits.
    UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
    UP.PartialThreshold =
        std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
  }
  // 3rd priority is full unroll count.
  // Full unroll makes sense only when TripCount or its upper bound could be
  // statically calculated.
  // Also we need to check if we exceed FullUnrollMaxCount.
  // If using the upper bound to unroll, TripMultiple should be set to 1 because
  // we do not know when loop may exit.
  // MaxTripCount and ExactTripCount cannot both be non zero since we only
  // compute the former when the latter is zero.
  unsigned ExactTripCount = TripCount;
  assert((ExactTripCount == 0 || MaxTripCount == 0) &&
         "ExtractTripCount and MaxTripCount cannot both be non zero.");
  unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount;
  UP.Count = FullUnrollTripCount;
  if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
    // When computing the unrolled size, note that BEInsns are not replicated
    // like the rest of the loop body.
    if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
      UseUpperBound = (MaxTripCount == FullUnrollTripCount);
      TripCount = FullUnrollTripCount;
      TripMultiple = UP.UpperBound ? 1 : TripMultiple;
      return ExplicitUnroll;
    } else {
      // The loop isn't that small, but we still can fully unroll it if that
      // helps to remove a significant number of instructions.
      // To check that, run additional analysis on the loop.
      if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
              L, FullUnrollTripCount, DT, SE, EphValues, TTI,
              UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
        unsigned Boost =
            getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
        if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
          UseUpperBound = (MaxTripCount == FullUnrollTripCount);
          TripCount = FullUnrollTripCount;
          TripMultiple = UP.UpperBound ? 1 : TripMultiple;
          return ExplicitUnroll;
        }
      }
    }
  }
  // 4th priority is loop peeling.
  computePeelCount(L, LoopSize, UP, TripCount, SE);
  if (UP.PeelCount) {
    // Peeling replaces unrolling: run the body once, no runtime remainder.
    UP.Runtime = false;
    UP.Count = 1;
    return ExplicitUnroll;
  }
  // 5th priority is partial unrolling.
  // Try partial unroll only when TripCount could be statically calculated.
  if (TripCount) {
    UP.Partial |= ExplicitUnroll;
    if (!UP.Partial) {
      LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
                        << "-unroll-allow-partial not given\n");
      UP.Count = 0;
      return false;
    }
    if (UP.Count == 0)
      UP.Count = TripCount;
    if (UP.PartialThreshold != NoThreshold) {
      // Reduce unroll count to be modulo of TripCount for partial unrolling.
      if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
        UP.Count =
            (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
            (LoopSize - UP.BEInsns);
      if (UP.Count > UP.MaxCount)
        UP.Count = UP.MaxCount;
      // Walk the count down until it evenly divides the trip count.
      while (UP.Count != 0 && TripCount % UP.Count != 0)
        UP.Count--;
      if (UP.AllowRemainder && UP.Count <= 1) {
        // If there is no Count that is modulo of TripCount, set Count to
        // largest power-of-two factor that satisfies the threshold limit.
        // As we'll create fixup loop, do the type of unrolling only if
        // remainder loop is allowed.
        UP.Count = UP.DefaultUnrollRuntimeCount;
        while (UP.Count != 0 &&
               getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
          UP.Count >>= 1;
      }
      if (UP.Count < 2) {
        if (PragmaEnableUnroll)
          ORE->emit([&]() {
            return OptimizationRemarkMissed(DEBUG_TYPE,
                                            "UnrollAsDirectedTooLarge",
                                            L->getStartLoc(), L->getHeader())
                   << "Unable to unroll loop as directed by unroll(enable) "
                      "pragma "
                      "because unrolled size is too large.";
          });
        UP.Count = 0;
      }
    } else {
      UP.Count = TripCount;
    }
    if (UP.Count > UP.MaxCount)
      UP.Count = UP.MaxCount;
    if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
        UP.Count != TripCount)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "FullUnrollAsDirectedTooLarge",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to fully unroll loop as directed by unroll pragma "
                  "because "
                  "unrolled size is too large.";
      });
    return ExplicitUnroll;
  }
  assert(TripCount == 0 &&
         "All cases when TripCount is constant should be covered here.");
  if (PragmaFullUnroll)
    ORE->emit([&]() {
      return OptimizationRemarkMissed(
                 DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
                 L->getStartLoc(), L->getHeader())
             << "Unable to fully unroll loop as directed by unroll(full) "
                "pragma "
                "because loop has a runtime trip count.";
    });
  // 6th priority is runtime unrolling.
  // Don't unroll a runtime trip count loop when it is disabled.
  if (HasRuntimeUnrollDisablePragma(L)) {
    UP.Count = 0;
    return false;
  }
  // Check if the runtime trip count is too small when profile is available.
  if (L->getHeader()->getParent()->hasProfileData()) {
    if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
      if (*ProfileTripCount < FlatLoopTripCountThreshold)
        return false;
      else
        UP.AllowExpensiveTripCount = true;
    }
  }
  // Reduce count based on the type of unrolling and the threshold values.
  UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
  if (!UP.Runtime) {
    LLVM_DEBUG(
        dbgs() << " will not try to unroll loop with runtime trip count "
               << "-unroll-runtime not given\n");
    UP.Count = 0;
    return false;
  }
  if (UP.Count == 0)
    UP.Count = UP.DefaultUnrollRuntimeCount;
  // Reduce unroll count to be the largest power-of-two factor of
  // the original count which satisfies the threshold limit.
  while (UP.Count != 0 &&
         getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
    UP.Count >>= 1;
#ifndef NDEBUG
  unsigned OrigCount = UP.Count;
#endif
  if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
    // No remainder loop allowed: shrink the count until it divides the trip
    // multiple (power-of-two halving preserves the threshold property).
    while (UP.Count != 0 && TripMultiple % UP.Count != 0)
      UP.Count >>= 1;
    LLVM_DEBUG(
        dbgs() << "Remainder loop is restricted (that could architecture "
                  "specific or because the loop contains a convergent "
                  "instruction), so unroll count must divide the trip "
                  "multiple, "
               << TripMultiple << ". Reducing unroll count from " << OrigCount
               << " to " << UP.Count << ".\n");
    using namespace ore;
    if (PragmaCount > 0 && !UP.AllowRemainder)
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE,
                                        "DifferentUnrollCountFromDirected",
                                        L->getStartLoc(), L->getHeader())
               << "Unable to unroll loop the number of times directed by "
                  "unroll_count pragma because remainder loop is restricted "
                  "(that could architecture specific or because the loop "
                  "contains a convergent instruction) and so must have an "
                  "unroll "
                  "count that divides the loop trip multiple of "
               << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
               << NV("UnrollCount", UP.Count) << " time(s).";
      });
  }
  if (UP.Count > UP.MaxCount)
    UP.Count = UP.MaxCount;
  LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
                    << "\n");
  // A count below 2 means no unrolling will happen; report that as 0.
  if (UP.Count < 2)
    UP.Count = 0;
  return ExplicitUnroll;
}
// Drives one unrolling attempt on loop L: gathers target preferences, sizes
// the loop, computes trip count information, picks an unroll count via
// computeUnrollCount(), performs the transformation, and finally updates the
// loop metadata (follow-up attributes / "already unrolled" marker).
static LoopUnrollResult tryToUnrollLoop(
    Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
    const TargetTransformInfo &TTI, AssumptionCache &AC,
    OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, int OptLevel,
    Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold,
    Optional<bool> ProvidedAllowPartial, Optional<bool> ProvidedRuntime,
    Optional<bool> ProvidedUpperBound, Optional<bool> ProvidedAllowPeeling) {
  LLVM_DEBUG(dbgs() << "Loop Unroll: F["
                    << L->getHeader()->getParent()->getName() << "] Loop %"
                    << L->getHeader()->getName() << "\n");
  // Respect an explicit llvm.loop.unroll.disable transformation hint.
  if (hasUnrollTransformation(L) & TM_Disable)
    return LoopUnrollResult::Unmodified;
  if (!L->isLoopSimplifyForm()) {
    LLVM_DEBUG(
        dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
    return LoopUnrollResult::Unmodified;
  }
  unsigned NumInlineCandidates;
  bool NotDuplicatable;
  bool Convergent;
  TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
      L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount,
      ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
      ProvidedAllowPeeling);
  // Exit early if unrolling is disabled.
  if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
    return LoopUnrollResult::Unmodified;
  // Ephemeral values (only feeding assumes) should not count toward size.
  SmallPtrSet<const Value *, 32> EphValues;
  CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
  unsigned LoopSize =
      ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
                          TTI, EphValues, UP.BEInsns);
  LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
  if (NotDuplicatable) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
                      << " instructions.\n");
    return LoopUnrollResult::Unmodified;
  }
  // Unrolling would duplicate calls that the inliner wants to handle first.
  if (NumInlineCandidates != 0) {
    LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
    return LoopUnrollResult::Unmodified;
  }
  // Find trip count and trip multiple if count is not available
  unsigned TripCount = 0;
  unsigned MaxTripCount = 0;
  unsigned TripMultiple = 1;
  // If there are multiple exiting blocks but one of them is the latch, use the
  // latch for the trip count estimation. Otherwise insist on a single exiting
  // block for the trip count estimation.
  BasicBlock *ExitingBlock = L->getLoopLatch();
  if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
    ExitingBlock = L->getExitingBlock();
  if (ExitingBlock) {
    TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
    TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
  }
  // If the loop contains a convergent operation, the prelude we'd add
  // to do the first few instructions before we hit the unrolled loop
  // is unsafe -- it adds a control-flow dependency to the convergent
  // operation. Therefore restrict remainder loop (try unrolling without).
  //
  // TODO: This is quite conservative. In practice, convergent_op()
  // is likely to be called unconditionally in the loop. In this
  // case, the program would be ill-formed (on most architectures)
  // unless n were the same on all threads in a thread group.
  // Assuming n is the same on all threads, any kind of unrolling is
  // safe. But currently llvm's notion of convergence isn't powerful
  // enough to express this.
  if (Convergent)
    UP.AllowRemainder = false;
  // Try to find the trip count upper bound if we cannot find the exact trip
  // count.
  bool MaxOrZero = false;
  if (!TripCount) {
    MaxTripCount = SE.getSmallConstantMaxTripCount(L);
    MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
    // We can unroll by the upper bound amount if it's generally allowed or if
    // we know that the loop is executed either the upper bound or zero times.
    // (MaxOrZero unrolling keeps only the first loop test, so the number of
    // loop tests remains the same compared to the non-unrolled version, whereas
    // the generic upper bound unrolling keeps all but the last loop test so the
    // number of loop tests goes up which may end up being worse on targets with
    // constrained branch predictor resources so is controlled by an option.)
    // In addition we only unroll small upper bounds.
    if (!(UP.UpperBound || MaxOrZero) || MaxTripCount > UnrollMaxUpperBound) {
      MaxTripCount = 0;
    }
  }
  // computeUnrollCount() decides whether it is beneficial to use upper bound to
  // fully unroll the loop.
  bool UseUpperBound = false;
  bool IsCountSetExplicitly = computeUnrollCount(
      L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount,
      TripMultiple, LoopSize, UP, UseUpperBound);
  if (!UP.Count)
    return LoopUnrollResult::Unmodified;
  // Unroll factor (Count) must be less or equal to TripCount.
  if (TripCount && UP.Count > TripCount)
    UP.Count = TripCount;
  // Save loop properties before it is transformed.
  MDNode *OrigLoopID = L->getLoopID();
  // Unroll the loop.
  Loop *RemainderLoop = nullptr;
  LoopUnrollResult UnrollResult = UnrollLoop(
      L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
      UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
      LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
  if (UnrollResult == LoopUnrollResult::Unmodified)
    return LoopUnrollResult::Unmodified;
  // Propagate the followup_remainder attributes onto the new remainder loop.
  if (RemainderLoop) {
    Optional<MDNode *> RemainderLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupRemainder});
    if (RemainderLoopID.hasValue())
      RemainderLoop->setLoopID(RemainderLoopID.getValue());
  }
  // If the loop survived (partial/runtime unroll), install the followup
  // attributes on it; an explicit followup list takes precedence over the
  // automatic "already unrolled" marker below.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
    Optional<MDNode *> NewLoopID =
        makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
                                        LLVMLoopUnrollFollowupUnrolled});
    if (NewLoopID.hasValue()) {
      L->setLoopID(NewLoopID.getValue());
      // Do not setLoopAlreadyUnrolled if loop attributes have been specified
      // explicitly.
      return UnrollResult;
    }
  }
  // If loop has an unroll count pragma or unrolled by explicitly set count
  // mark loop as unrolled to prevent unrolling beyond that requested.
  // If the loop was peeled, we already "used up" the profile information
  // we had, so we don't want to unroll or peel again.
  if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
      (IsCountSetExplicitly || UP.PeelCount))
    L->setLoopAlreadyUnrolled();
  return UnrollResult;
}
  990. namespace {
  991. class LoopUnroll : public LoopPass {
  992. public:
  993. static char ID; // Pass ID, replacement for typeid
  994. int OptLevel;
  995. Optional<unsigned> ProvidedCount;
  996. Optional<unsigned> ProvidedThreshold;
  997. Optional<bool> ProvidedAllowPartial;
  998. Optional<bool> ProvidedRuntime;
  999. Optional<bool> ProvidedUpperBound;
  1000. Optional<bool> ProvidedAllowPeeling;
  1001. LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
  1002. Optional<unsigned> Count = None,
  1003. Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
  1004. Optional<bool> UpperBound = None,
  1005. Optional<bool> AllowPeeling = None)
  1006. : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
  1007. ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
  1008. ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
  1009. ProvidedAllowPeeling(AllowPeeling) {
  1010. initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
  1011. }
  1012. bool runOnLoop(Loop *L, LPPassManager &LPM) override {
  1013. if (skipLoop(L))
  1014. return false;
  1015. Function &F = *L->getHeader()->getParent();
  1016. auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1017. LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  1018. ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  1019. const TargetTransformInfo &TTI =
  1020. getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  1021. auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  1022. // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
  1023. // pass. Function analyses need to be preserved across loop transformations
  1024. // but ORE cannot be preserved (see comment before the pass definition).
  1025. OptimizationRemarkEmitter ORE(&F);
  1026. bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  1027. LoopUnrollResult Result = tryToUnrollLoop(
  1028. L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount,
  1029. ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime,
  1030. ProvidedUpperBound, ProvidedAllowPeeling);
  1031. if (Result == LoopUnrollResult::FullyUnrolled)
  1032. LPM.markLoopAsDeleted(*L);
  1033. return Result != LoopUnrollResult::Unmodified;
  1034. }
  1035. /// This transformation requires natural loop information & requires that
  1036. /// loop preheaders be inserted into the CFG...
  1037. void getAnalysisUsage(AnalysisUsage &AU) const override {
  1038. AU.addRequired<AssumptionCacheTracker>();
  1039. AU.addRequired<TargetTransformInfoWrapperPass>();
  1040. // FIXME: Loop passes are required to preserve domtree, and for now we just
  1041. // recreate dom info if anything gets unrolled.
  1042. getLoopAnalysisUsage(AU);
  1043. }
  1044. };
  1045. } // end anonymous namespace
char LoopUnroll::ID = 0;

// Register the legacy pass (and the analyses it depends on) with the pass
// registry so it can be constructed by name (-loop-unroll) and scheduled by
// the legacy pass manager.
INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
  1052. Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
  1053. int AllowPartial, int Runtime, int UpperBound,
  1054. int AllowPeeling) {
  1055. // TODO: It would make more sense for this function to take the optionals
  1056. // directly, but that's dangerous since it would silently break out of tree
  1057. // callers.
  1058. return new LoopUnroll(
  1059. OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
  1060. Count == -1 ? None : Optional<unsigned>(Count),
  1061. AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
  1062. Runtime == -1 ? None : Optional<bool>(Runtime),
  1063. UpperBound == -1 ? None : Optional<bool>(UpperBound),
  1064. AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
  1065. }
  1066. Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
  1067. return createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0, 0);
  1068. }
// New pass manager entry point for full unrolling only. Beyond performing the
// transformation it must keep the LPMUpdater's view of the loop nest coherent:
// report newly-exposed sibling loops, and report this loop as deleted when it
// was fully unrolled away.
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
                                          LoopStandardAnalysisResults &AR,
                                          LPMUpdater &Updater) {
  const auto &FAM =
      AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
  Function *F = L.getHeader()->getParent();

  auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
  // FIXME: This should probably be optional rather than required.
  if (!ORE)
    report_fatal_error(
        "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
        "cached at a higher level");

  // Keep track of the previous loop structure so we can identify new loops
  // created by unrolling.
  Loop *ParentL = L.getParentLoop();
  SmallPtrSet<Loop *, 4> OldLoops;
  if (ParentL)
    OldLoops.insert(ParentL->begin(), ParentL->end());
  else
    OldLoops.insert(AR.LI.begin(), AR.LI.end());

  // Copy the name now; the loop object may be gone after unrolling.
  std::string LoopName = L.getName();

  bool Changed =
      tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
                      /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
                      /*Threshold*/ None, /*AllowPartial*/ false,
                      /*Runtime*/ false, /*UpperBound*/ false,
                      /*AllowPeeling*/ false) != LoopUnrollResult::Unmodified;
  if (!Changed)
    return PreservedAnalyses::all();

  // The parent must not be damaged by unrolling!
#ifndef NDEBUG
  if (ParentL)
    ParentL->verifyLoop();
#endif

  // Unrolling can do several things to introduce new loops into a loop nest:
  // - Full unrolling clones child loops within the current loop but then
  //   removes the current loop making all of the children appear to be new
  //   sibling loops.
  //
  // When a new loop appears as a sibling loop after fully unrolling,
  // its nesting structure has fundamentally changed and we want to revisit
  // it to reflect that.
  //
  // When unrolling has removed the current loop, we need to tell the
  // infrastructure that it is gone.
  //
  // Finally, we support a debugging/testing mode where we revisit child loops
  // as well. These are not expected to require further optimizations as either
  // they or the loop they were cloned from have been directly visited already.
  // But the debugging mode allows us to check this assumption.
  bool IsCurrentLoopValid = false;
  SmallVector<Loop *, 4> SibLoops;
  if (ParentL)
    SibLoops.append(ParentL->begin(), ParentL->end());
  else
    SibLoops.append(AR.LI.begin(), AR.LI.end());
  // Keep only genuinely new siblings; finding L itself proves it survived.
  erase_if(SibLoops, [&](Loop *SibLoop) {
    if (SibLoop == &L) {
      IsCurrentLoopValid = true;
      return true;
    }

    // Otherwise erase the loop from the list if it was in the old loops.
    return OldLoops.count(SibLoop) != 0;
  });
  Updater.addSiblingLoops(SibLoops);

  if (!IsCurrentLoopValid) {
    Updater.markLoopAsDeleted(L, LoopName);
  } else {
    // We can only walk child loops if the current loop remained valid.
    if (UnrollRevisitChildLoops) {
      // Walk *all* of the child loops.
      SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
      Updater.addChildLoops(ChildLoops);
    }
  }

  return getLoopPassPreservedAnalyses();
}
  1146. template <typename RangeT>
  1147. static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
  1148. SmallVector<Loop *, 8> Worklist;
  1149. // We use an internal worklist to build up the preorder traversal without
  1150. // recursion.
  1151. SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
  1152. for (Loop *RootL : Loops) {
  1153. assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
  1154. assert(PreOrderWorklist.empty() &&
  1155. "Must start with an empty preorder walk worklist.");
  1156. PreOrderWorklist.push_back(RootL);
  1157. do {
  1158. Loop *L = PreOrderWorklist.pop_back_val();
  1159. PreOrderWorklist.append(L->begin(), L->end());
  1160. PreOrderLoops.push_back(L);
  1161. } while (!PreOrderWorklist.empty());
  1162. Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
  1163. PreOrderLoops.clear();
  1164. }
  1165. return Worklist;
  1166. }
// New pass manager function-level entry point: simplifies every loop, then
// walks all loops in the function (outer-to-inner preorder, processed back to
// front) attempting to unroll each one with the pass's configured options.
PreservedAnalyses LoopUnrollPass::run(Function &F,
                                      FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  // The loop analysis manager is optional; if present we must clear cached
  // results for loops we delete.
  LoopAnalysisManager *LAM = nullptr;
  if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
    LAM = &LAMProxy->getManager();

  const ModuleAnalysisManager &MAM =
      AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
  ProfileSummaryInfo *PSI =
      MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());

  bool Changed = false;

  // The unroller requires loops to be in simplified form, and also needs LCSSA.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop unroller
  // will simplify all loops, regardless of whether anything end up being
  // unrolled.
  for (auto &L : LI) {
    Changed |= simplifyLoop(L, &DT, &LI, &SE, &AC, false /* PreserveLCSSA */);
    Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
  }

  SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);

  while (!Worklist.empty()) {
    // Because the LoopInfo stores the loops in RPO, we walk the worklist
    // from back to front so that we work forward across the CFG, which
    // for unrolling is only needed to get optimization remarks emitted in
    // a forward order.
    Loop &L = *Worklist.pop_back_val();
#ifndef NDEBUG
    Loop *ParentL = L.getParentLoop();
#endif

    // Check if the profile summary indicates that the profiled application
    // has a huge working set size, in which case we disable peeling to avoid
    // bloating it further.
    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
    if (PSI && PSI->hasHugeWorkingSetSize())
      LocalAllowPeeling = false;
    // Copy the name before transforming: the Loop may not outlive unrolling.
    std::string LoopName = L.getName();
    // The API here is quite complex to call and we allow to select some
    // flavors of unrolling during construction time (by setting UnrollOpts).
    LoopUnrollResult Result = tryToUnrollLoop(
        &L, DT, &LI, SE, TTI, AC, ORE,
        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None,
        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
        UnrollOpts.AllowUpperBound, LocalAllowPeeling);
    Changed |= Result != LoopUnrollResult::Unmodified;

    // The parent must not be damaged by unrolling!
#ifndef NDEBUG
    if (Result != LoopUnrollResult::Unmodified && ParentL)
      ParentL->verifyLoop();
#endif

    // Clear any cached analysis results for L if we removed it completely.
    if (LAM && Result == LoopUnrollResult::FullyUnrolled)
      LAM->clear(L, LoopName);
  }

  if (!Changed)
    return PreservedAnalyses::all();

  return getLoopPassPreservedAnalyses();
}