  1. //===- LoopAccessAnalysis.cpp - Loop Access Analysis Implementation --------==//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // The implementation of the loop memory dependence analysis that was
  10. // originally developed for the loop vectorizer.
  11. //
  12. //===----------------------------------------------------------------------===//
  13. #include "llvm/Analysis/LoopAccessAnalysis.h"
  14. #include "llvm/ADT/APInt.h"
  15. #include "llvm/ADT/DenseMap.h"
  16. #include "llvm/ADT/DepthFirstIterator.h"
  17. #include "llvm/ADT/EquivalenceClasses.h"
  18. #include "llvm/ADT/PointerIntPair.h"
  19. #include "llvm/ADT/STLExtras.h"
  20. #include "llvm/ADT/SetVector.h"
  21. #include "llvm/ADT/SmallPtrSet.h"
  22. #include "llvm/ADT/SmallSet.h"
  23. #include "llvm/ADT/SmallVector.h"
  24. #include "llvm/ADT/iterator_range.h"
  25. #include "llvm/Analysis/AliasAnalysis.h"
  26. #include "llvm/Analysis/AliasSetTracker.h"
  27. #include "llvm/Analysis/LoopAnalysisManager.h"
  28. #include "llvm/Analysis/LoopInfo.h"
  29. #include "llvm/Analysis/MemoryLocation.h"
  30. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  31. #include "llvm/Analysis/ScalarEvolution.h"
  32. #include "llvm/Analysis/ScalarEvolutionExpander.h"
  33. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  34. #include "llvm/Analysis/TargetLibraryInfo.h"
  35. #include "llvm/Analysis/ValueTracking.h"
  36. #include "llvm/Analysis/VectorUtils.h"
  37. #include "llvm/IR/BasicBlock.h"
  38. #include "llvm/IR/Constants.h"
  39. #include "llvm/IR/DataLayout.h"
  40. #include "llvm/IR/DebugLoc.h"
  41. #include "llvm/IR/DerivedTypes.h"
  42. #include "llvm/IR/DiagnosticInfo.h"
  43. #include "llvm/IR/Dominators.h"
  44. #include "llvm/IR/Function.h"
  45. #include "llvm/IR/IRBuilder.h"
  46. #include "llvm/IR/InstrTypes.h"
  47. #include "llvm/IR/Instruction.h"
  48. #include "llvm/IR/Instructions.h"
  49. #include "llvm/IR/Operator.h"
  50. #include "llvm/IR/PassManager.h"
  51. #include "llvm/IR/Type.h"
  52. #include "llvm/IR/Value.h"
  53. #include "llvm/IR/ValueHandle.h"
  54. #include "llvm/Pass.h"
  55. #include "llvm/Support/Casting.h"
  56. #include "llvm/Support/CommandLine.h"
  57. #include "llvm/Support/Debug.h"
  58. #include "llvm/Support/ErrorHandling.h"
  59. #include "llvm/Support/raw_ostream.h"
  60. #include <algorithm>
  61. #include <cassert>
  62. #include <cstdint>
  63. #include <cstdlib>
  64. #include <iterator>
  65. #include <utility>
  66. #include <vector>
  67. using namespace llvm;
  68. #define DEBUG_TYPE "loop-accesses"
  69. static cl::opt<unsigned, true>
  70. VectorizationFactor("force-vector-width", cl::Hidden,
  71. cl::desc("Sets the SIMD width. Zero is autoselect."),
  72. cl::location(VectorizerParams::VectorizationFactor));
  73. unsigned VectorizerParams::VectorizationFactor;
  74. static cl::opt<unsigned, true>
  75. VectorizationInterleave("force-vector-interleave", cl::Hidden,
  76. cl::desc("Sets the vectorization interleave count. "
  77. "Zero is autoselect."),
  78. cl::location(
  79. VectorizerParams::VectorizationInterleave));
  80. unsigned VectorizerParams::VectorizationInterleave;
  81. static cl::opt<unsigned, true> RuntimeMemoryCheckThreshold(
  82. "runtime-memory-check-threshold", cl::Hidden,
  83. cl::desc("When performing memory disambiguation checks at runtime do not "
  84. "generate more than this number of comparisons (default = 8)."),
  85. cl::location(VectorizerParams::RuntimeMemoryCheckThreshold), cl::init(8));
  86. unsigned VectorizerParams::RuntimeMemoryCheckThreshold;
  87. /// The maximum iterations used to merge memory checks
  88. static cl::opt<unsigned> MemoryCheckMergeThreshold(
  89. "memory-check-merge-threshold", cl::Hidden,
  90. cl::desc("Maximum number of comparisons done when trying to merge "
  91. "runtime memory checks. (default = 100)"),
  92. cl::init(100));
  93. /// Maximum SIMD width.
  94. const unsigned VectorizerParams::MaxVectorWidth = 64;
  95. /// We collect dependences up to this threshold.
  96. static cl::opt<unsigned>
  97. MaxDependences("max-dependences", cl::Hidden,
  98. cl::desc("Maximum number of dependences collected by "
  99. "loop-access analysis (default = 100)"),
  100. cl::init(100));
  101. /// This enables versioning on the strides of symbolically striding memory
  102. /// accesses in code like the following.
  103. /// for (i = 0; i < N; ++i)
  104. /// A[i * Stride1] += B[i * Stride2] ...
  105. ///
  106. /// Will be roughly translated to
  107. /// if (Stride1 == 1 && Stride2 == 1) {
  108. /// for (i = 0; i < N; i+=4)
  109. /// A[i:i+3] += ...
  110. /// } else
  111. /// ...
  112. static cl::opt<bool> EnableMemAccessVersioning(
  113. "enable-mem-access-versioning", cl::init(true), cl::Hidden,
  114. cl::desc("Enable symbolic stride memory access versioning"));
  115. /// Enable store-to-load forwarding conflict detection. This option can
  116. /// be disabled for correctness testing.
  117. static cl::opt<bool> EnableForwardingConflictDetection(
  118. "store-to-load-forwarding-conflict-detection", cl::Hidden,
  119. cl::desc("Enable conflict detection in loop-access analysis"),
  120. cl::init(true));
  121. bool VectorizerParams::isInterleaveForced() {
  122. return ::VectorizationInterleave.getNumOccurrences() > 0;
  123. }
  124. Value *llvm::stripIntegerCast(Value *V) {
  125. if (auto *CI = dyn_cast<CastInst>(V))
  126. if (CI->getOperand(0)->getType()->isIntegerTy())
  127. return CI->getOperand(0);
  128. return V;
  129. }
  130. const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
  131. const ValueToValueMap &PtrToStride,
  132. Value *Ptr, Value *OrigPtr) {
  133. const SCEV *OrigSCEV = PSE.getSCEV(Ptr);
  134. // If there is an entry in the map return the SCEV of the pointer with the
  135. // symbolic stride replaced by one.
  136. ValueToValueMap::const_iterator SI =
  137. PtrToStride.find(OrigPtr ? OrigPtr : Ptr);
  138. if (SI != PtrToStride.end()) {
  139. Value *StrideVal = SI->second;
  140. // Strip casts.
  141. StrideVal = stripIntegerCast(StrideVal);
  142. ScalarEvolution *SE = PSE.getSE();
  143. const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
  144. const auto *CT =
  145. static_cast<const SCEVConstant *>(SE->getOne(StrideVal->getType()));
  146. PSE.addPredicate(*SE->getEqualPredicate(U, CT));
  147. auto *Expr = PSE.getSCEV(Ptr);
  148. LLVM_DEBUG(dbgs() << "LAA: Replacing SCEV: " << *OrigSCEV
  149. << " by: " << *Expr << "\n");
  150. return Expr;
  151. }
  152. // Otherwise, just return the SCEV of the original pointer.
  153. return OrigSCEV;
  154. }
  155. /// Calculate Start and End points of memory access.
  156. /// Let's assume A is the first access and B is a memory access on the N-th
  157. /// loop iteration. Then B is calculated as:
  158. /// B = A + Step*N .
  159. /// Step value may be positive or negative.
  160. /// N is a calculated back-edge taken count:
  161. /// N = (TripCount > 0) ? RoundDown(TripCount - 1, VF) : 0
  162. /// Start and End points are calculated in the following way:
  163. /// Start = UMIN(A, B) ; End = UMAX(A, B) + SizeOfElt,
  164. /// where SizeOfElt is the size of single memory access in bytes.
  165. ///
  166. /// There is no conflict when the intervals are disjoint:
  167. /// NoConflict = (P2.Start >= P1.End) || (P1.Start >= P2.End)
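/// 
/// For example (illustrative), for a 4-byte access with A = %p, Step = 4 and a
/// back-edge taken count of 7, B = %p + 28, so Start = %p and End = %p + 32.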
  168. void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
  169. unsigned DepSetId, unsigned ASId,
  170. const ValueToValueMap &Strides,
  171. PredicatedScalarEvolution &PSE) {
  172. // Get the stride replaced scev.
  173. const SCEV *Sc = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
  174. ScalarEvolution *SE = PSE.getSE();
  175. const SCEV *ScStart;
  176. const SCEV *ScEnd;
  177. if (SE->isLoopInvariant(Sc, Lp))
  178. ScStart = ScEnd = Sc;
  179. else {
  180. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  181. assert(AR && "Invalid addrec expression");
  182. const SCEV *Ex = PSE.getBackedgeTakenCount();
  183. ScStart = AR->getStart();
  184. ScEnd = AR->evaluateAtIteration(Ex, *SE);
  185. const SCEV *Step = AR->getStepRecurrence(*SE);
  186. // For expressions with negative step, the upper bound is ScStart and the
  187. // lower bound is ScEnd.
  188. if (const auto *CStep = dyn_cast<SCEVConstant>(Step)) {
  189. if (CStep->getValue()->isNegative())
  190. std::swap(ScStart, ScEnd);
  191. } else {
  192. // Fallback case: the step is not constant, but we can still
  193. // get the upper and lower bounds of the interval by using min/max
  194. // expressions.
  195. ScStart = SE->getUMinExpr(ScStart, ScEnd);
  196. ScEnd = SE->getUMaxExpr(AR->getStart(), ScEnd);
  197. }
  198. // Add the size of the pointed element to ScEnd.
  199. unsigned EltSize =
  200. Ptr->getType()->getPointerElementType()->getScalarSizeInBits() / 8;
  201. const SCEV *EltSizeSCEV = SE->getConstant(ScEnd->getType(), EltSize);
  202. ScEnd = SE->getAddExpr(ScEnd, EltSizeSCEV);
  203. }
  204. Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
  205. }
  206. SmallVector<RuntimePointerChecking::PointerCheck, 4>
  207. RuntimePointerChecking::generateChecks() const {
  208. SmallVector<PointerCheck, 4> Checks;
  209. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  210. for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
  211. const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I];
  212. const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J];
  213. if (needsChecking(CGI, CGJ))
  214. Checks.push_back(std::make_pair(&CGI, &CGJ));
  215. }
  216. }
  217. return Checks;
  218. }
  219. void RuntimePointerChecking::generateChecks(
  220. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  221. assert(Checks.empty() && "Checks is not empty");
  222. groupChecks(DepCands, UseDependencies);
  223. Checks = generateChecks();
  224. }
  225. bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M,
  226. const CheckingPtrGroup &N) const {
  227. for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
  228. for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
  229. if (needsChecking(M.Members[I], N.Members[J]))
  230. return true;
  231. return false;
  232. }
  233. /// Compare \p I and \p J and return the minimum.
  234. /// Return nullptr in case we couldn't find an answer.
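/// For example (illustrative), for I = (%p + 8) and J = (%p + 16) the
/// difference J - I is the constant 8, which is non-negative, so I is the
/// minimum; for unrelated pointers the difference is not a SCEVConstant and
/// nullptr is returned.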
  235. static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
  236. ScalarEvolution *SE) {
  237. const SCEV *Diff = SE->getMinusSCEV(J, I);
  238. const SCEVConstant *C = dyn_cast<const SCEVConstant>(Diff);
  239. if (!C)
  240. return nullptr;
  241. if (C->getValue()->isNegative())
  242. return J;
  243. return I;
  244. }
  245. bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
  246. const SCEV *Start = RtCheck.Pointers[Index].Start;
  247. const SCEV *End = RtCheck.Pointers[Index].End;
  248. // Compare the starts and ends with the known minimum and maximum
  249. // of this set. We need to know how we compare against the min/max
  250. // of the set in order to be able to emit memchecks.
  251. const SCEV *Min0 = getMinFromExprs(Start, Low, RtCheck.SE);
  252. if (!Min0)
  253. return false;
  254. const SCEV *Min1 = getMinFromExprs(End, High, RtCheck.SE);
  255. if (!Min1)
  256. return false;
  257. // Update the low bound expression if we've found a new min value.
  258. if (Min0 == Start)
  259. Low = Start;
  260. // Update the high bound expression if we've found a new max value.
  261. if (Min1 != End)
  262. High = End;
  263. Members.push_back(Index);
  264. return true;
  265. }
  266. void RuntimePointerChecking::groupChecks(
  267. MemoryDepChecker::DepCandidates &DepCands, bool UseDependencies) {
  268. // We build the groups from dependency candidates equivalence classes
  269. // because:
  270. // - We know that pointers in the same equivalence class share
  271. // the same underlying object and therefore there is a chance
  272. // that we can compare pointers
  273. // - We wouldn't be able to merge two pointers for which we need
  274. // to emit a memcheck. The classes in DepCands are already
  275. // conveniently built such that no two pointers in the same
  276. // class need checking against each other.
  277. // We use the following (greedy) algorithm to construct the groups
  278. // For every pointer in the equivalence class:
  279. // For each existing group:
  280. // - if the difference between this pointer and the min/max bounds
  281. // of the group is a constant, then make the pointer part of the
  282. // group and update the min/max bounds of that group as required.
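// For example (illustrative), pointer ranges starting at %a, %a + 4 and
// %a + 400 all differ from each other by constants, so they can be merged
// into one group whose Low/High bounds are widened as members are added,
// whereas a pointer whose distance to the group bounds is not a SCEVConstant
// (e.g. %a + 4 * %n) cannot be compared against the group's min/max and
// starts a new group instead.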
  283. CheckingGroups.clear();
  284. // If we need to check two pointers to the same underlying object
  285. // with a non-constant difference, we shouldn't perform any pointer
  286. // grouping with those pointers. This is because we can easily get
  287. // into cases where the resulting check would return false, even when
  288. // the accesses are safe.
  289. //
  290. // The following example shows this:
  291. // for (i = 0; i < 1000; ++i)
  292. // a[5000 + i * m] = a[i] + a[i + 9000]
  293. //
  294. // Here grouping gives a check of (5000, 5000 + 1000 * m) against
  295. // (0, 10000) which is always false. However, if m is 1, there is no
  296. // dependence. Not grouping the checks for a[i] and a[i + 9000] allows
  297. // us to perform an accurate check in this case.
  298. //
  299. // The above case requires that we have an UnknownDependence between
  300. // accesses to the same underlying object. This cannot happen unless
  301. // FoundNonConstantDistanceDependence is set, and therefore UseDependencies
  302. // is also false. In this case we will use the fallback path and create
  303. // separate checking groups for all pointers.
  304. // If we don't have the dependency partitions, construct a new
  305. // checking pointer group for each pointer. This is also required
  306. // for correctness, because in this case we can have checking between
  307. // pointers to the same underlying object.
  308. if (!UseDependencies) {
  309. for (unsigned I = 0; I < Pointers.size(); ++I)
  310. CheckingGroups.push_back(CheckingPtrGroup(I, *this));
  311. return;
  312. }
  313. unsigned TotalComparisons = 0;
  314. DenseMap<Value *, unsigned> PositionMap;
  315. for (unsigned Index = 0; Index < Pointers.size(); ++Index)
  316. PositionMap[Pointers[Index].PointerValue] = Index;
  317. // We need to keep track of what pointers we've already seen so we
  318. // don't process them twice.
  319. SmallSet<unsigned, 2> Seen;
  320. // Go through all equivalence classes, get the "pointer check groups"
  321. // and add them to the overall solution. We use the order in which accesses
  322. // appear in 'Pointers' to enforce determinism.
  323. for (unsigned I = 0; I < Pointers.size(); ++I) {
  324. // We've seen this pointer before, and therefore already processed
  325. // its equivalence class.
  326. if (Seen.count(I))
  327. continue;
  328. MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
  329. Pointers[I].IsWritePtr);
  330. SmallVector<CheckingPtrGroup, 2> Groups;
  331. auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
  332. // Because DepCands is constructed by visiting accesses in the order in
  333. // which they appear in alias sets (which is deterministic) and the
  334. // iteration order within an equivalence class member is only dependent on
  335. // the order in which unions and insertions are performed on the
  336. // equivalence class, the iteration order is deterministic.
  337. for (auto MI = DepCands.member_begin(LeaderI), ME = DepCands.member_end();
  338. MI != ME; ++MI) {
  339. unsigned Pointer = PositionMap[MI->getPointer()];
  340. bool Merged = false;
  341. // Mark this pointer as seen.
  342. Seen.insert(Pointer);
  343. // Go through all the existing sets and see if we can find one
  344. // which can include this pointer.
  345. for (CheckingPtrGroup &Group : Groups) {
  346. // Don't perform more than a certain amount of comparisons.
  347. // This should limit the cost of grouping the pointers to something
  348. // reasonable. If we do end up hitting this threshold, the algorithm
  349. // will create separate groups for all remaining pointers.
  350. if (TotalComparisons > MemoryCheckMergeThreshold)
  351. break;
  352. TotalComparisons++;
  353. if (Group.addPointer(Pointer)) {
  354. Merged = true;
  355. break;
  356. }
  357. }
  358. if (!Merged)
  359. // We couldn't add this pointer to any existing set or the threshold
  360. // for the number of comparisons has been reached. Create a new group
  361. // to hold the current pointer.
  362. Groups.push_back(CheckingPtrGroup(Pointer, *this));
  363. }
  364. // We've computed the grouped checks for this partition.
  365. // Save the results and continue with the next one.
  366. llvm::copy(Groups, std::back_inserter(CheckingGroups));
  367. }
  368. }
  369. bool RuntimePointerChecking::arePointersInSamePartition(
  370. const SmallVectorImpl<int> &PtrToPartition, unsigned PtrIdx1,
  371. unsigned PtrIdx2) {
  372. return (PtrToPartition[PtrIdx1] != -1 &&
  373. PtrToPartition[PtrIdx1] == PtrToPartition[PtrIdx2]);
  374. }
  375. bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
  376. const PointerInfo &PointerI = Pointers[I];
  377. const PointerInfo &PointerJ = Pointers[J];
  378. // No need to check if two readonly pointers intersect.
  379. if (!PointerI.IsWritePtr && !PointerJ.IsWritePtr)
  380. return false;
  381. // Only need to check pointers between two different dependency sets.
  382. if (PointerI.DependencySetId == PointerJ.DependencySetId)
  383. return false;
  384. // Only need to check pointers in the same alias set.
  385. if (PointerI.AliasSetId != PointerJ.AliasSetId)
  386. return false;
  387. return true;
  388. }
  389. void RuntimePointerChecking::printChecks(
  390. raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks,
  391. unsigned Depth) const {
  392. unsigned N = 0;
  393. for (const auto &Check : Checks) {
  394. const auto &First = Check.first->Members, &Second = Check.second->Members;
  395. OS.indent(Depth) << "Check " << N++ << ":\n";
  396. OS.indent(Depth + 2) << "Comparing group (" << Check.first << "):\n";
  397. for (unsigned K = 0; K < First.size(); ++K)
  398. OS.indent(Depth + 2) << *Pointers[First[K]].PointerValue << "\n";
  399. OS.indent(Depth + 2) << "Against group (" << Check.second << "):\n";
  400. for (unsigned K = 0; K < Second.size(); ++K)
  401. OS.indent(Depth + 2) << *Pointers[Second[K]].PointerValue << "\n";
  402. }
  403. }
  404. void RuntimePointerChecking::print(raw_ostream &OS, unsigned Depth) const {
  405. OS.indent(Depth) << "Run-time memory checks:\n";
  406. printChecks(OS, Checks, Depth);
  407. OS.indent(Depth) << "Grouped accesses:\n";
  408. for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
  409. const auto &CG = CheckingGroups[I];
  410. OS.indent(Depth + 2) << "Group " << &CG << ":\n";
  411. OS.indent(Depth + 4) << "(Low: " << *CG.Low << " High: " << *CG.High
  412. << ")\n";
  413. for (unsigned J = 0; J < CG.Members.size(); ++J) {
  414. OS.indent(Depth + 6) << "Member: " << *Pointers[CG.Members[J]].Expr
  415. << "\n";
  416. }
  417. }
  418. }
  419. namespace {
  420. /// Analyses memory accesses in a loop.
  421. ///
  422. /// Checks whether run time pointer checks are needed and builds sets for data
  423. /// dependence checking.
  424. class AccessAnalysis {
  425. public:
  426. /// Read or write access location.
  427. typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
  428. typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
  429. AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AliasAnalysis *AA,
  430. LoopInfo *LI, MemoryDepChecker::DepCandidates &DA,
  431. PredicatedScalarEvolution &PSE)
  432. : DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
  433. IsRTCheckAnalysisNeeded(false), PSE(PSE) {}
  434. /// Register a load and whether it is only read from.
  435. void addLoad(MemoryLocation &Loc, bool IsReadOnly) {
  436. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  437. AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
  438. Accesses.insert(MemAccessInfo(Ptr, false));
  439. if (IsReadOnly)
  440. ReadOnlyPtr.insert(Ptr);
  441. }
  442. /// Register a store.
  443. void addStore(MemoryLocation &Loc) {
  444. Value *Ptr = const_cast<Value*>(Loc.Ptr);
  445. AST.add(Ptr, LocationSize::unknown(), Loc.AATags);
  446. Accesses.insert(MemAccessInfo(Ptr, true));
  447. }
  448. /// Check if we can emit a run-time no-alias check for \p Access.
  449. ///
  450. /// Returns true if we can emit a run-time no alias check for \p Access.
  451. /// If we can check this access, this also adds it to a dependence set and
  452. /// adds a run-time check for it to \p RtCheck. If \p Assume is true,
  453. /// we will attempt to use additional run-time checks in order to get
  454. /// the bounds of the pointer.
  455. bool createCheckForAccess(RuntimePointerChecking &RtCheck,
  456. MemAccessInfo Access,
  457. const ValueToValueMap &Strides,
  458. DenseMap<Value *, unsigned> &DepSetId,
  459. Loop *TheLoop, unsigned &RunningDepId,
  460. unsigned ASId, bool ShouldCheckStride,
  461. bool Assume);
  462. /// Check whether we can check the pointers at runtime for
  463. /// non-intersection.
  464. ///
  465. /// Returns true if we need no check or if we do and we can generate them
  466. /// (i.e. the pointers have computable bounds).
  467. bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
  468. Loop *TheLoop, const ValueToValueMap &Strides,
  469. bool ShouldCheckWrap = false);
  470. /// Goes over all memory accesses, checks whether a RT check is needed
  471. /// and builds sets of dependent accesses.
  472. void buildDependenceSets() {
  473. processMemAccesses();
  474. }
  475. /// Initial processing of memory accesses determined that we need to
  476. /// perform dependency checking.
  477. ///
  478. /// Note that this can later be cleared if we retry memcheck analysis without
  479. /// dependency checking (i.e. FoundNonConstantDistanceDependence).
  480. bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
  481. /// We decided that no dependence analysis would be used. Reset the state.
  482. void resetDepChecks(MemoryDepChecker &DepChecker) {
  483. CheckDeps.clear();
  484. DepChecker.clearDependences();
  485. }
  486. MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
  487. private:
  488. typedef SetVector<MemAccessInfo> PtrAccessSet;
  489. /// Go over all memory accesses and check whether runtime pointer checks
  490. /// are needed and build sets of dependency check candidates.
  491. void processMemAccesses();
  492. /// Set of all accesses.
  493. PtrAccessSet Accesses;
  494. const DataLayout &DL;
  495. /// The loop being checked.
  496. const Loop *TheLoop;
  497. /// List of accesses that need a further dependence check.
  498. MemAccessInfoList CheckDeps;
  499. /// Set of pointers that are read only.
  500. SmallPtrSet<Value*, 16> ReadOnlyPtr;
  501. /// An alias set tracker to partition the access set by underlying object and
  502. /// intrinsic property (such as TBAA metadata).
  503. AliasSetTracker AST;
  504. LoopInfo *LI;
  505. /// Sets of potentially dependent accesses - members of one set share an
  506. /// underlying pointer. The set "CheckDeps" identifies which sets really need a
  507. /// dependence check.
  508. MemoryDepChecker::DepCandidates &DepCands;
  509. /// Initial processing of memory accesses determined that we may need
  510. /// to add memchecks. Perform the analysis to determine the necessary checks.
  511. ///
  512. /// Note that this is different from isDependencyCheckNeeded. When we retry
  513. /// memcheck analysis without dependency checking
  514. /// (i.e. FoundNonConstantDistanceDependence), isDependencyCheckNeeded is
  515. /// cleared while this remains set if we have potentially dependent accesses.
  516. bool IsRTCheckAnalysisNeeded;
  517. /// The SCEV predicate containing all the SCEV-related assumptions.
  518. PredicatedScalarEvolution &PSE;
  519. };
  520. } // end anonymous namespace
  521. /// Check whether a pointer can participate in a runtime bounds check.
  522. /// If \p Assume, try harder to prove that we can compute the bounds of \p Ptr
  523. /// by adding run-time checks (overflow checks) if necessary.
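/// For example (illustrative), a pointer whose SCEV is the affine AddRec
/// {%base,+,4}<%loop> has computable bounds, while a pointer that is neither
/// loop-invariant nor an (assumable) affine AddRec does not.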
  524. static bool hasComputableBounds(PredicatedScalarEvolution &PSE,
  525. const ValueToValueMap &Strides, Value *Ptr,
  526. Loop *L, bool Assume) {
  527. const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
  528. // The bounds for a loop-invariant pointer are trivial.
  529. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  530. return true;
  531. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  532. if (!AR && Assume)
  533. AR = PSE.getAsAddRec(Ptr);
  534. if (!AR)
  535. return false;
  536. return AR->isAffine();
  537. }
  538. /// Check whether a pointer address cannot wrap.
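/// A pointer is considered non-wrapping here if it is loop-invariant, has a
/// stride of one, or already carries an NUSW no-overflow predicate in \p PSE.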
  539. static bool isNoWrap(PredicatedScalarEvolution &PSE,
  540. const ValueToValueMap &Strides, Value *Ptr, Loop *L) {
  541. const SCEV *PtrScev = PSE.getSCEV(Ptr);
  542. if (PSE.getSE()->isLoopInvariant(PtrScev, L))
  543. return true;
  544. int64_t Stride = getPtrStride(PSE, Ptr, L, Strides);
  545. if (Stride == 1 || PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW))
  546. return true;
  547. return false;
  548. }
  549. bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
  550. MemAccessInfo Access,
  551. const ValueToValueMap &StridesMap,
  552. DenseMap<Value *, unsigned> &DepSetId,
  553. Loop *TheLoop, unsigned &RunningDepId,
  554. unsigned ASId, bool ShouldCheckWrap,
  555. bool Assume) {
  556. Value *Ptr = Access.getPointer();
  557. if (!hasComputableBounds(PSE, StridesMap, Ptr, TheLoop, Assume))
  558. return false;
  559. // When we run after a failing dependency check we have to make sure
  560. // we don't have wrapping pointers.
  561. if (ShouldCheckWrap && !isNoWrap(PSE, StridesMap, Ptr, TheLoop)) {
  562. auto *Expr = PSE.getSCEV(Ptr);
  563. if (!Assume || !isa<SCEVAddRecExpr>(Expr))
  564. return false;
  565. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  566. }
  567. // The id of the dependence set.
  568. unsigned DepId;
  569. if (isDependencyCheckNeeded()) {
  570. Value *Leader = DepCands.getLeaderValue(Access).getPointer();
  571. unsigned &LeaderId = DepSetId[Leader];
  572. if (!LeaderId)
  573. LeaderId = RunningDepId++;
  574. DepId = LeaderId;
  575. } else
  576. // Each access has its own dependence set.
  577. DepId = RunningDepId++;
  578. bool IsWrite = Access.getInt();
  579. RtCheck.insert(TheLoop, Ptr, IsWrite, DepId, ASId, StridesMap, PSE);
  580. LLVM_DEBUG(dbgs() << "LAA: Found a runtime check ptr:" << *Ptr << '\n');
  581. return true;
  582. }
  583. bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
  584. ScalarEvolution *SE, Loop *TheLoop,
  585. const ValueToValueMap &StridesMap,
  586. bool ShouldCheckWrap) {
  587. // Find pointers with computable bounds. We are going to use this information
  588. // to place a runtime bound check.
  589. bool CanDoRT = true;
  590. bool NeedRTCheck = false;
  591. if (!IsRTCheckAnalysisNeeded) return true;
  592. bool IsDepCheckNeeded = isDependencyCheckNeeded();
  593. // We assign a consecutive id to accesses from different alias sets.
  594. // Accesses between different groups don't need to be checked.
  595. unsigned ASId = 1;
  596. for (auto &AS : AST) {
  597. int NumReadPtrChecks = 0;
  598. int NumWritePtrChecks = 0;
  599. bool CanDoAliasSetRT = true;
  600. // We assign a consecutive id to accesses from different dependence sets.
  601. // Accesses within the same set don't need a runtime check.
  602. unsigned RunningDepId = 1;
  603. DenseMap<Value *, unsigned> DepSetId;
  604. SmallVector<MemAccessInfo, 4> Retries;
  605. for (auto A : AS) {
  606. Value *Ptr = A.getValue();
  607. bool IsWrite = Accesses.count(MemAccessInfo(Ptr, true));
  608. MemAccessInfo Access(Ptr, IsWrite);
  609. if (IsWrite)
  610. ++NumWritePtrChecks;
  611. else
  612. ++NumReadPtrChecks;
  613. if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId, TheLoop,
  614. RunningDepId, ASId, ShouldCheckWrap, false)) {
  615. LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:" << *Ptr << '\n');
  616. Retries.push_back(Access);
  617. CanDoAliasSetRT = false;
  618. }
  619. }
  620. // If we have at least two writes or one write and a read then we need to
  621. // check them. But there is no need for checks if there is only one
  622. // dependence set for this alias set.
  623. //
  624. // Note that this function computes CanDoRT and NeedRTCheck independently.
  625. // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
  626. // for which we couldn't find the bounds but we don't actually need to emit
  627. // any checks so it does not matter.
  628. bool NeedsAliasSetRTCheck = false;
  629. if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
  630. NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
  631. (NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1));
  632. // We need to perform run-time alias checks, but some pointers had bounds
  633. // that couldn't be checked.
  634. if (NeedsAliasSetRTCheck && !CanDoAliasSetRT) {
  635. // Reset the CanDoAliasSetRT flag and retry all accesses that have failed.
  636. // We know that we need these checks, so we can now be more aggressive
  637. // and add further checks if required (overflow checks).
  638. CanDoAliasSetRT = true;
  639. for (auto Access : Retries)
  640. if (!createCheckForAccess(RtCheck, Access, StridesMap, DepSetId,
  641. TheLoop, RunningDepId, ASId,
  642. ShouldCheckWrap, /*Assume=*/true)) {
  643. CanDoAliasSetRT = false;
  644. break;
  645. }
  646. }
  647. CanDoRT &= CanDoAliasSetRT;
  648. NeedRTCheck |= NeedsAliasSetRTCheck;
  649. ++ASId;
  650. }
  651. // If the pointers that we would use for the bounds comparison have different
  652. // address spaces, assume the values aren't directly comparable, so we can't
  653. // use them for the runtime check. We also have to assume they could
  654. // overlap. In the future there should be metadata for whether address spaces
  655. // are disjoint.
  656. unsigned NumPointers = RtCheck.Pointers.size();
  657. for (unsigned i = 0; i < NumPointers; ++i) {
  658. for (unsigned j = i + 1; j < NumPointers; ++j) {
  659. // Only need to check pointers between two different dependency sets.
  660. if (RtCheck.Pointers[i].DependencySetId ==
  661. RtCheck.Pointers[j].DependencySetId)
  662. continue;
  663. // Only need to check pointers in the same alias set.
  664. if (RtCheck.Pointers[i].AliasSetId != RtCheck.Pointers[j].AliasSetId)
  665. continue;
  666. Value *PtrI = RtCheck.Pointers[i].PointerValue;
  667. Value *PtrJ = RtCheck.Pointers[j].PointerValue;
  668. unsigned ASi = PtrI->getType()->getPointerAddressSpace();
  669. unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
  670. if (ASi != ASj) {
  671. LLVM_DEBUG(
  672. dbgs() << "LAA: Runtime check would require comparison between"
  673. " different address spaces\n");
  674. return false;
  675. }
  676. }
  677. }
  678. if (NeedRTCheck && CanDoRT)
  679. RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
  680. LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
  681. << " pointer comparisons.\n");
  682. RtCheck.Need = NeedRTCheck;
  683. bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
  684. if (!CanDoRTIfNeeded)
  685. RtCheck.reset();
  686. return CanDoRTIfNeeded;
  687. }
  688. void AccessAnalysis::processMemAccesses() {
  689. // We process the set twice: first we process read-write pointers, last we
  690. // process read-only pointers. This allows us to skip dependence tests for
  691. // read-only pointers.
  692. LLVM_DEBUG(dbgs() << "LAA: Processing memory accesses...\n");
  693. LLVM_DEBUG(dbgs() << " AST: "; AST.dump());
  694. LLVM_DEBUG(dbgs() << "LAA: Accesses(" << Accesses.size() << "):\n");
  695. LLVM_DEBUG({
  696. for (auto A : Accesses)
  697. dbgs() << "\t" << *A.getPointer() << " (" <<
  698. (A.getInt() ? "write" : (ReadOnlyPtr.count(A.getPointer()) ?
  699. "read-only" : "read")) << ")\n";
  700. });
  701. // The AliasSetTracker has nicely partitioned our pointers by metadata
  702. // compatibility and potential for underlying-object overlap. As a result, we
  703. // only need to check for potential pointer dependencies within each alias
  704. // set.
  705. for (auto &AS : AST) {
  706. // Note that both the alias-set tracker and the alias sets themselves use
  707. // linked lists internally and so the iteration order here is deterministic
  708. // (matching the original instruction order within each set).
  709. bool SetHasWrite = false;
  710. // Map of pointers to last access encountered.
  711. typedef DenseMap<const Value*, MemAccessInfo> UnderlyingObjToAccessMap;
  712. UnderlyingObjToAccessMap ObjToLastAccess;
  713. // Set of accesses to check after all writes have been processed.
  714. PtrAccessSet DeferredAccesses;
  715. // Iterate over each alias set twice, once to process read/write pointers,
  716. // and then to process read-only pointers.
  717. for (int SetIteration = 0; SetIteration < 2; ++SetIteration) {
  718. bool UseDeferred = SetIteration > 0;
  719. PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
  720. for (auto AV : AS) {
  721. Value *Ptr = AV.getValue();
  722. // For a single memory access in AliasSetTracker, Accesses may contain
  723. // both read and write, and they both need to be handled for CheckDeps.
  724. for (auto AC : S) {
  725. if (AC.getPointer() != Ptr)
  726. continue;
  727. bool IsWrite = AC.getInt();
  728. // If we're using the deferred access set, then it contains only
  729. // reads.
  730. bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
  731. if (UseDeferred && !IsReadOnlyPtr)
  732. continue;
  733. // Otherwise, the pointer must be in the PtrAccessSet, either as a
  734. // read or a write.
  735. assert(((IsReadOnlyPtr && UseDeferred) || IsWrite ||
  736. S.count(MemAccessInfo(Ptr, false))) &&
  737. "Alias-set pointer not in the access set?");
  738. MemAccessInfo Access(Ptr, IsWrite);
  739. DepCands.insert(Access);
  740. // Memorize read-only pointers for later processing and skip them in
  741. // the first round (they need to be checked after we have seen all
  742. // write pointers). Note: we also mark pointers that are not
  743. // consecutive as "read-only" pointers (so that we check
  744. // "a[b[i]] +="). Hence, we need the second check for "!IsWrite".
  745. if (!UseDeferred && IsReadOnlyPtr) {
  746. DeferredAccesses.insert(Access);
  747. continue;
  748. }
  749. // If this is a write, check other reads and writes for conflicts. If
  750. // this is a read, only check other writes for conflicts (but only if
  751. // there is no other write to the ptr - this is an optimization to
  752. // catch "a[i] = a[i] + " without having to do a dependence check).
  753. if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
  754. CheckDeps.push_back(Access);
  755. IsRTCheckAnalysisNeeded = true;
  756. }
  757. if (IsWrite)
  758. SetHasWrite = true;
  759. // Create sets of pointers connected by a shared alias set and
  760. // underlying object.
  761. typedef SmallVector<const Value *, 16> ValueVector;
  762. ValueVector TempObjects;
  763. GetUnderlyingObjects(Ptr, TempObjects, DL, LI);
  764. LLVM_DEBUG(dbgs()
  765. << "Underlying objects for pointer " << *Ptr << "\n");
  766. for (const Value *UnderlyingObj : TempObjects) {
  767. // nullptr never aliases, so don't join sets for pointers that have "null"
  768. // in their UnderlyingObjects list.
  769. if (isa<ConstantPointerNull>(UnderlyingObj) &&
  770. !NullPointerIsDefined(
  771. TheLoop->getHeader()->getParent(),
  772. UnderlyingObj->getType()->getPointerAddressSpace()))
  773. continue;
  774. UnderlyingObjToAccessMap::iterator Prev =
  775. ObjToLastAccess.find(UnderlyingObj);
  776. if (Prev != ObjToLastAccess.end())
  777. DepCands.unionSets(Access, Prev->second);
  778. ObjToLastAccess[UnderlyingObj] = Access;
  779. LLVM_DEBUG(dbgs() << " " << *UnderlyingObj << "\n");
  780. }
  781. }
  782. }
  783. }
  784. }
  785. }
  786. static bool isInBoundsGep(Value *Ptr) {
  787. if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
  788. return GEP->isInBounds();
  789. return false;
  790. }
  791. /// Return true if an AddRec pointer \p Ptr is unsigned non-wrapping,
  792. /// i.e. monotonically increasing/decreasing.
  793. static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
  794. PredicatedScalarEvolution &PSE, const Loop *L) {
  795. // FIXME: This should probably only return true for NUW.
  796. if (AR->getNoWrapFlags(SCEV::NoWrapMask))
  797. return true;
  798. // Scalar evolution does not propagate the non-wrapping flags to values that
  799. // are derived from a non-wrapping induction variable because non-wrapping
  800. // could be flow-sensitive.
  801. //
  802. // Look through the potentially overflowing instruction to try to prove
  803. // non-wrapping for the *specific* value of Ptr.
  804. // The arithmetic implied by an inbounds GEP can't overflow.
  805. auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
  806. if (!GEP || !GEP->isInBounds())
  807. return false;
  808. // Make sure there is only one non-const index and analyze that.
  809. Value *NonConstIndex = nullptr;
  810. for (Value *Index : make_range(GEP->idx_begin(), GEP->idx_end()))
  811. if (!isa<ConstantInt>(Index)) {
  812. if (NonConstIndex)
  813. return false;
  814. NonConstIndex = Index;
  815. }
  816. if (!NonConstIndex)
  817. // The recurrence is on the pointer, ignore for now.
  818. return false;
  819. // The index in GEP is signed. It is non-wrapping if it's derived from a NSW
  820. // AddRec using a NSW operation.
  821. if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(NonConstIndex))
  822. if (OBO->hasNoSignedWrap() &&
  823. // Assume the other operand is constant so that the AddRec can be
  824. // easily found.
  825. isa<ConstantInt>(OBO->getOperand(1))) {
  826. auto *OpScev = PSE.getSCEV(OBO->getOperand(0));
  827. if (auto *OpAR = dyn_cast<SCEVAddRecExpr>(OpScev))
  828. return OpAR->getLoop() == L && OpAR->getNoWrapFlags(SCEV::FlagNSW);
  829. }
  830. return false;
  831. }
  832. /// Check whether the access through \p Ptr has a constant stride.
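/// Returns the stride in units of the element size, e.g. (illustrative) 1 for
/// a consecutive forward access, -2 for an access stepping backwards by two
/// elements, and 0 if no constant stride could be established.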
  833. int64_t llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
  834. const Loop *Lp, const ValueToValueMap &StridesMap,
  835. bool Assume, bool ShouldCheckWrap) {
  836. Type *Ty = Ptr->getType();
  837. assert(Ty->isPointerTy() && "Unexpected non-ptr");
  838. // Make sure that the pointer does not point to aggregate types.
  839. auto *PtrTy = cast<PointerType>(Ty);
  840. if (PtrTy->getElementType()->isAggregateType()) {
  841. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a pointer to a scalar type"
  842. << *Ptr << "\n");
  843. return 0;
  844. }
  845. const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr);
  846. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
  847. if (Assume && !AR)
  848. AR = PSE.getAsAddRec(Ptr);
  849. if (!AR) {
  850. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not an AddRecExpr pointer " << *Ptr
  851. << " SCEV: " << *PtrScev << "\n");
  852. return 0;
  853. }
  854. // The access function must stride over the innermost loop.
  855. if (Lp != AR->getLoop()) {
  856. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop "
  857. << *Ptr << " SCEV: " << *AR << "\n");
  858. return 0;
  859. }
  860. // The address calculation must not wrap. Otherwise, a dependence could be
  861. // inverted.
  862. // An inbounds getelementptr that is an AddRec with a unit stride
  863. // cannot wrap by definition. The unit stride requirement is checked later.
  864. // A getelementptr without an inbounds attribute and with a unit stride would
  865. // have to access the pointer value "0", which is undefined behavior in
  866. // address space 0, therefore we can also vectorize this case.
  867. bool IsInBoundsGEP = isInBoundsGep(Ptr);
  868. bool IsNoWrapAddRec = !ShouldCheckWrap ||
  869. PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
  870. isNoWrapAddRec(Ptr, AR, PSE, Lp);
  871. if (!IsNoWrapAddRec && !IsInBoundsGEP &&
  872. NullPointerIsDefined(Lp->getHeader()->getParent(),
  873. PtrTy->getAddressSpace())) {
  874. if (Assume) {
  875. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  876. IsNoWrapAddRec = true;
  877. LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap in the address space:\n"
  878. << "LAA: Pointer: " << *Ptr << "\n"
  879. << "LAA: SCEV: " << *AR << "\n"
  880. << "LAA: Added an overflow assumption\n");
  881. } else {
  882. LLVM_DEBUG(
  883. dbgs() << "LAA: Bad stride - Pointer may wrap in the address space "
  884. << *Ptr << " SCEV: " << *AR << "\n");
  885. return 0;
  886. }
  887. }
  888. // Check the step is constant.
  889. const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
  890. // Calculate the pointer stride and check if it is constant.
  891. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  892. if (!C) {
  893. LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr
  894. << " SCEV: " << *AR << "\n");
  895. return 0;
  896. }
  897. auto &DL = Lp->getHeader()->getModule()->getDataLayout();
  898. int64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
  899. const APInt &APStepVal = C->getAPInt();
  900. // Huge step value - give up.
  901. if (APStepVal.getBitWidth() > 64)
  902. return 0;
  903. int64_t StepVal = APStepVal.getSExtValue();
  904. // Strided access.
  905. int64_t Stride = StepVal / Size;
  906. int64_t Rem = StepVal % Size;
  907. if (Rem)
  908. return 0;
  909. // If the SCEV could wrap but we have an inbounds gep with a unit stride we
  910. // know we can't "wrap around the address space". In case of address space
  911. // zero we know that this won't happen without triggering undefined behavior.
  912. if (!IsNoWrapAddRec && Stride != 1 && Stride != -1 &&
  913. (IsInBoundsGEP || !NullPointerIsDefined(Lp->getHeader()->getParent(),
  914. PtrTy->getAddressSpace()))) {
  915. if (Assume) {
  916. // We can avoid this case by adding a run-time check.
  917. LLVM_DEBUG(dbgs() << "LAA: Non unit strided pointer which is not either "
  918. << "inbounds or in address space 0 may wrap:\n"
  919. << "LAA: Pointer: " << *Ptr << "\n"
  920. << "LAA: SCEV: " << *AR << "\n"
  921. << "LAA: Added an overflow assumption\n");
  922. PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW);
  923. } else
  924. return 0;
  925. }
  926. return Stride;
  927. }
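/// Sort the pointers in \p VL by their constant offset from the first pointer
/// and write the resulting permutation into \p SortedIndices (left empty when
/// the pointers are already in offset order). For example (illustrative),
/// VL = {%a + 8, %a, %a + 4} yields SortedIndices = {1, 2, 0}. Returns false
/// if the pointers do not share an underlying object or their offsets from
/// the first pointer are not distinct compile-time constants.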
  928. bool llvm::sortPtrAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
  929. ScalarEvolution &SE,
  930. SmallVectorImpl<unsigned> &SortedIndices) {
  931. assert(llvm::all_of(
  932. VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
  933. "Expected list of pointer operands.");
  934. SmallVector<std::pair<int64_t, Value *>, 4> OffValPairs;
  935. OffValPairs.reserve(VL.size());
  936. // Walk over the pointers, and map each of them to an offset relative to
  937. // the first pointer in the array.
  938. Value *Ptr0 = VL[0];
  939. const SCEV *Scev0 = SE.getSCEV(Ptr0);
  940. Value *Obj0 = GetUnderlyingObject(Ptr0, DL);
  941. llvm::SmallSet<int64_t, 4> Offsets;
  942. for (auto *Ptr : VL) {
  943. // TODO: Outline this code as a special, more time-consuming, version of
  944. // the computeConstantDifference() function.
  945. if (Ptr->getType()->getPointerAddressSpace() !=
  946. Ptr0->getType()->getPointerAddressSpace())
  947. return false;
  948. // If a pointer refers to a different underlying object, bail - the
  949. // pointers are by definition incomparable.
  950. Value *CurrObj = GetUnderlyingObject(Ptr, DL);
  951. if (CurrObj != Obj0)
  952. return false;
  953. const SCEV *Scev = SE.getSCEV(Ptr);
  954. const auto *Diff = dyn_cast<SCEVConstant>(SE.getMinusSCEV(Scev, Scev0));
  955. // The pointers may not have a constant offset from each other, or SCEV
  956. // may just not be smart enough to figure out they do. Regardless,
  957. // there's nothing we can do.
  958. if (!Diff)
  959. return false;
  960. // Bail out if another pointer with the same offset has already been seen.
  961. int64_t Offset = Diff->getAPInt().getSExtValue();
  962. if (!Offsets.insert(Offset).second)
  963. return false;
  964. OffValPairs.emplace_back(Offset, Ptr);
  965. }
  966. SortedIndices.clear();
  967. SortedIndices.resize(VL.size());
  968. std::iota(SortedIndices.begin(), SortedIndices.end(), 0);
  969. // Sort the indices by increasing offset from the first pointer.
  970. llvm::stable_sort(SortedIndices, [&](unsigned Left, unsigned Right) {
  971. return OffValPairs[Left].first < OffValPairs[Right].first;
  972. });
  973. // Check if the order is consecutive already.
  974. if (llvm::all_of(SortedIndices, [&SortedIndices](const unsigned I) {
  975. return I == SortedIndices[I];
  976. }))
  977. SortedIndices.clear();
  978. return true;
  979. }
  980. /// Take the address space operand from the Load/Store instruction.
  981. /// Returns -1 if this is not a valid Load/Store instruction.
  982. static unsigned getAddressSpaceOperand(Value *I) {
  983. if (LoadInst *L = dyn_cast<LoadInst>(I))
  984. return L->getPointerAddressSpace();
  985. if (StoreInst *S = dyn_cast<StoreInst>(I))
  986. return S->getPointerAddressSpace();
  987. return -1;
  988. }
  989. /// Returns true if the memory operations \p A and \p B are consecutive.
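/// "Consecutive" means \p B starts exactly where \p A ends; e.g.
/// (illustrative) a load of a[i] immediately followed in memory by a load of
/// a[i+1] for a 4-byte element type, where the second pointer equals the
/// first plus 4 bytes.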
  990. bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
  991. ScalarEvolution &SE, bool CheckType) {
  992. Value *PtrA = getLoadStorePointerOperand(A);
  993. Value *PtrB = getLoadStorePointerOperand(B);
  994. unsigned ASA = getAddressSpaceOperand(A);
  995. unsigned ASB = getAddressSpaceOperand(B);
  996. // Check that the address spaces match and that the pointers are valid.
  997. if (!PtrA || !PtrB || (ASA != ASB))
  998. return false;
  999. // Make sure that A and B are different pointers.
  1000. if (PtrA == PtrB)
  1001. return false;
  1002. // Make sure that A and B have the same type if required.
  1003. if (CheckType && PtrA->getType() != PtrB->getType())
  1004. return false;
  1005. unsigned IdxWidth = DL.getIndexSizeInBits(ASA);
  1006. Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  1007. APInt Size(IdxWidth, DL.getTypeStoreSize(Ty));
  1008. APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
  1009. PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  1010. PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
  1011. // OffsetDelta = OffsetB - OffsetA;
  1012. const SCEV *OffsetSCEVA = SE.getConstant(OffsetA);
  1013. const SCEV *OffsetSCEVB = SE.getConstant(OffsetB);
  1014. const SCEV *OffsetDeltaSCEV = SE.getMinusSCEV(OffsetSCEVB, OffsetSCEVA);
  1015. const SCEVConstant *OffsetDeltaC = dyn_cast<SCEVConstant>(OffsetDeltaSCEV);
  1016. const APInt &OffsetDelta = OffsetDeltaC->getAPInt();
  1017. // Check if they are based on the same pointer. That makes the offsets
  1018. // sufficient.
  1019. if (PtrA == PtrB)
  1020. return OffsetDelta == Size;
  1021. // Compute the necessary base pointer delta to have the necessary final delta
  1022. // equal to the size.
  1023. // BaseDelta = Size - OffsetDelta;
  1024. const SCEV *SizeSCEV = SE.getConstant(Size);
  1025. const SCEV *BaseDelta = SE.getMinusSCEV(SizeSCEV, OffsetDeltaSCEV);
  1026. // Otherwise compute the distance with SCEV between the base pointers.
  1027. const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  1028. const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  1029. const SCEV *X = SE.getAddExpr(PtrSCEVA, BaseDelta);
  1030. return X == PtrSCEVB;
  1031. }
  1032. MemoryDepChecker::VectorizationSafetyStatus
  1033. MemoryDepChecker::Dependence::isSafeForVectorization(DepType Type) {
  1034. switch (Type) {
  1035. case NoDep:
  1036. case Forward:
  1037. case BackwardVectorizable:
  1038. return VectorizationSafetyStatus::Safe;
  1039. case Unknown:
  1040. return VectorizationSafetyStatus::PossiblySafeWithRtChecks;
  1041. case ForwardButPreventsForwarding:
  1042. case Backward:
  1043. case BackwardVectorizableButPreventsForwarding:
  1044. return VectorizationSafetyStatus::Unsafe;
  1045. }
  1046. llvm_unreachable("unexpected DepType!");
  1047. }
  1048. bool MemoryDepChecker::Dependence::isBackward() const {
  1049. switch (Type) {
  1050. case NoDep:
  1051. case Forward:
  1052. case ForwardButPreventsForwarding:
  1053. case Unknown:
  1054. return false;
  1055. case BackwardVectorizable:
  1056. case Backward:
  1057. case BackwardVectorizableButPreventsForwarding:
  1058. return true;
  1059. }
  1060. llvm_unreachable("unexpected DepType!");
  1061. }
  1062. bool MemoryDepChecker::Dependence::isPossiblyBackward() const {
  1063. return isBackward() || Type == Unknown;
  1064. }
  1065. bool MemoryDepChecker::Dependence::isForward() const {
  1066. switch (Type) {
  1067. case Forward:
  1068. case ForwardButPreventsForwarding:
  1069. return true;
  1070. case NoDep:
  1071. case Unknown:
  1072. case BackwardVectorizable:
  1073. case Backward:
  1074. case BackwardVectorizableButPreventsForwarding:
  1075. return false;
  1076. }
  1077. llvm_unreachable("unexpected DepType!");
  1078. }
  1079. bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
  1080. uint64_t TypeByteSize) {
// If loads occur at a distance that is not a multiple of a feasible vector
// factor, store-load forwarding does not take place.
// Positive dependences might cause trouble because vectorizing them might
// prevent store-load forwarding, making the vectorized code run a lot slower.
//   a[i] = a[i-3] ^ a[i-8];
// The stores to a[i:i+1] don't align with the loads from a[i-3:i-2], so on a
// typical architecture store-load forwarding does not take place.
// Vectorizing in such cases does not make sense.
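// For illustration: with TypeByteSize = 4 and Distance = 12, the threshold
// below is 8 * 4 = 32 iterations. The loop then hits VF = 8 bytes, where
// 12 % 8 != 0 and 12 / 8 = 1 < 32, so the maximum VF without forwarding
// issues drops to 4 bytes; that is below 2 * TypeByteSize, and the function
// reports a potential store-load forwarding conflict.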
  1089. // Store-load forwarding distance.
  1090. // After this many iterations store-to-load forwarding conflicts should not
  1091. // cause any slowdowns.
  1092. const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
  1093. // Maximum vector factor.
  1094. uint64_t MaxVFWithoutSLForwardIssues = std::min(
  1095. VectorizerParams::MaxVectorWidth * TypeByteSize, MaxSafeDepDistBytes);
  1096. // Compute the smallest VF at which the store and load would be misaligned.
  1097. for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
  1098. VF *= 2) {
// If the number of vector iterations between the store and the load is
// small, we could incur conflicts.
  1101. if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
  1102. MaxVFWithoutSLForwardIssues = (VF >>= 1);
  1103. break;
  1104. }
  1105. }
  1106. if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
  1107. LLVM_DEBUG(
  1108. dbgs() << "LAA: Distance " << Distance
  1109. << " that could cause a store-load forwarding conflict\n");
  1110. return true;
  1111. }
  1112. if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
  1113. MaxVFWithoutSLForwardIssues !=
  1114. VectorizerParams::MaxVectorWidth * TypeByteSize)
  1115. MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
  1116. return false;
  1117. }
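/// Merge a newly computed safety status into the running status, keeping the
/// more conservative value; this relies on VectorizationSafetyStatus being
/// ordered from safe to unsafe.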
  1118. void MemoryDepChecker::mergeInStatus(VectorizationSafetyStatus S) {
  1119. if (Status < S)
  1120. Status = S;
  1121. }
/// Given a non-constant (unknown) dependence-distance \p Dist between two
/// memory accesses that have the same stride, whose absolute value is given
/// in \p Stride, and that have the same type size \p TypeByteSize, in a loop
/// whose backedge-taken count is \p BackedgeTakenCount, check whether it is
/// possible to prove statically that the dependence distance is larger than
/// the range that the accesses will travel through during the execution of
/// the loop. If so, return true; otherwise return false. This is useful for
  1129. /// example in loops such as the following (PR31098):
  1130. /// for (i = 0; i < D; ++i) {
  1131. /// = out[i];
  1132. /// out[i+D] =
  1133. /// }
  1134. static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
  1135. const SCEV &BackedgeTakenCount,
  1136. const SCEV &Dist, uint64_t Stride,
  1137. uint64_t TypeByteSize) {
  1138. // If we can prove that
  1139. // (**) |Dist| > BackedgeTakenCount * Step
  1140. // where Step is the absolute stride of the memory accesses in bytes,
  1141. // then there is no dependence.
  1142. //
  1143. // Rationale:
  1144. // We basically want to check if the absolute distance (|Dist/Step|)
  1145. // is >= the loop iteration count (or > BackedgeTakenCount).
  1146. // This is equivalent to the Strong SIV Test (Practical Dependence Testing,
  1147. // Section 4.2.1); Note, that for vectorization it is sufficient to prove
  1148. // that the dependence distance is >= VF; This is checked elsewhere.
  1149. // But in some cases we can prune unknown dependence distances early, and
  1150. // even before selecting the VF, and without a runtime test, by comparing
  1151. // the distance against the loop iteration count. Since the vectorized code
  1152. // will be executed only if LoopCount >= VF, proving distance >= LoopCount
  1153. // also guarantees that distance >= VF.
  1154. //
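// For illustration, in the PR31098 loop above with 4-byte elements:
// |Dist| = 4 * D, Step = 4 and BackedgeTakenCount = D - 1, so
// |Dist| - BackedgeTakenCount * Step = 4 * D - 4 * (D - 1) = 4 > 0,
// which proves (**) and lets us report the accesses as independent.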
  1155. const uint64_t ByteStride = Stride * TypeByteSize;
  1156. const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
  1157. const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
  1158. const SCEV *CastedDist = &Dist;
  1159. const SCEV *CastedProduct = Product;
  1160. uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
  1161. uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
  1162. // The dependence distance can be positive/negative, so we sign extend Dist;
  1163. // The multiplication of the absolute stride in bytes and the
  1164. // backedgeTakenCount is non-negative, so we zero extend Product.
  1165. if (DistTypeSize > ProductTypeSize)
  1166. CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
  1167. else
  1168. CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
  1169. // Is Dist - (BackedgeTakenCount * Step) > 0 ?
  1170. // (If so, then we have proven (**) because |Dist| >= Dist)
  1171. const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
  1172. if (SE.isKnownPositive(Minus))
  1173. return true;
  1174. // Second try: Is -Dist - (BackedgeTakenCount * Step) > 0 ?
  1175. // (If so, then we have proven (**) because |Dist| >= -1*Dist)
  1176. const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
  1177. Minus = SE.getMinusSCEV(NegDist, CastedProduct);
  1178. if (SE.isKnownPositive(Minus))
  1179. return true;
  1180. return false;
  1181. }
  1182. /// Check the dependence for two accesses with the same stride \p Stride.
  1183. /// \p Distance is the positive distance and \p TypeByteSize is type size in
  1184. /// bytes.
  1185. ///
  1186. /// \returns true if they are independent.
  1187. static bool areStridedAccessesIndependent(uint64_t Distance, uint64_t Stride,
  1188. uint64_t TypeByteSize) {
  1189. assert(Stride > 1 && "The stride must be greater than 1");
assert(TypeByteSize > 0 && "The type size in bytes must be non-zero");
assert(Distance > 0 && "The distance must be non-zero");
// Skip if the distance is not a multiple of the type byte size.
if (Distance % TypeByteSize)
  return false;
uint64_t ScaledDist = Distance / TypeByteSize;
// No dependence if the scaled distance is not a multiple of the stride.
  1197. // E.g.
  1198. // for (i = 0; i < 1024 ; i += 4)
  1199. // A[i+2] = A[i] + 1;
  1200. //
  1201. // Two accesses in memory (scaled distance is 2, stride is 4):
  1202. // | A[0] | | | | A[4] | | | |
  1203. // | | | A[2] | | | | A[6] | |
  1204. //
  1205. // E.g.
  1206. // for (i = 0; i < 1024 ; i += 3)
  1207. // A[i+4] = A[i] + 1;
  1208. //
  1209. // Two accesses in memory (scaled distance is 4, stride is 3):
  1210. // | A[0] | | | A[3] | | | A[6] | | |
  1211. // | | | | | A[4] | | | A[7] | |
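// Counter-example (dependent case):
// for (i = 0; i < 1024 ; i += 2)
// A[i+4] = A[i] + 1;
//
// Here the scaled distance is 4 and the stride is 2; 4 % 2 == 0, so the
// accesses can hit the same elements (A[4] is written at i == 0 and read at
// i == 4) and we must not report independence.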
  1212. return ScaledDist % Stride;
  1213. }
  1214. MemoryDepChecker::Dependence::DepType
  1215. MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
  1216. const MemAccessInfo &B, unsigned BIdx,
  1217. const ValueToValueMap &Strides) {
  1218. assert (AIdx < BIdx && "Must pass arguments in program order");
  1219. Value *APtr = A.getPointer();
  1220. Value *BPtr = B.getPointer();
  1221. bool AIsWrite = A.getInt();
  1222. bool BIsWrite = B.getInt();
  1223. // Two reads are independent.
  1224. if (!AIsWrite && !BIsWrite)
  1225. return Dependence::NoDep;
  1226. // We cannot check pointers in different address spaces.
  1227. if (APtr->getType()->getPointerAddressSpace() !=
  1228. BPtr->getType()->getPointerAddressSpace())
  1229. return Dependence::Unknown;
  1230. int64_t StrideAPtr = getPtrStride(PSE, APtr, InnermostLoop, Strides, true);
  1231. int64_t StrideBPtr = getPtrStride(PSE, BPtr, InnermostLoop, Strides, true);
  1232. const SCEV *Src = PSE.getSCEV(APtr);
  1233. const SCEV *Sink = PSE.getSCEV(BPtr);
  1234. // If the induction step is negative we have to invert source and sink of the
  1235. // dependence.
  1236. if (StrideAPtr < 0) {
  1237. std::swap(APtr, BPtr);
  1238. std::swap(Src, Sink);
  1239. std::swap(AIsWrite, BIsWrite);
  1240. std::swap(AIdx, BIdx);
  1241. std::swap(StrideAPtr, StrideBPtr);
  1242. }
  1243. const SCEV *Dist = PSE.getSE()->getMinusSCEV(Sink, Src);
LLVM_DEBUG(dbgs() << "LAA: Src Scev: " << *Src << ", Sink Scev: " << *Sink
                  << " (Induction step: " << StrideAPtr << ")\n");
  1246. LLVM_DEBUG(dbgs() << "LAA: Distance for " << *InstMap[AIdx] << " to "
  1247. << *InstMap[BIdx] << ": " << *Dist << "\n");
  1248. // Need accesses with constant stride. We don't want to vectorize
  1249. // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
  1250. // the address space.
if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr) {
  1252. LLVM_DEBUG(dbgs() << "Pointer access with non-constant stride\n");
  1253. return Dependence::Unknown;
  1254. }
  1255. Type *ATy = APtr->getType()->getPointerElementType();
  1256. Type *BTy = BPtr->getType()->getPointerElementType();
  1257. auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
  1258. uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
  1259. uint64_t Stride = std::abs(StrideAPtr);
  1260. const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
  1261. if (!C) {
  1262. if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
  1263. isSafeDependenceDistance(DL, *(PSE.getSE()),
  1264. *(PSE.getBackedgeTakenCount()), *Dist, Stride,
  1265. TypeByteSize))
  1266. return Dependence::NoDep;
  1267. LLVM_DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
  1268. FoundNonConstantDistanceDependence = true;
  1269. return Dependence::Unknown;
  1270. }
  1271. const APInt &Val = C->getAPInt();
  1272. int64_t Distance = Val.getSExtValue();
  1273. // Attempt to prove strided accesses independent.
  1274. if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
  1275. areStridedAccessesIndependent(std::abs(Distance), Stride, TypeByteSize)) {
  1276. LLVM_DEBUG(dbgs() << "LAA: Strided accesses are independent\n");
  1277. return Dependence::NoDep;
  1278. }
  1279. // Negative distances are not plausible dependencies.
  1280. if (Val.isNegative()) {
  1281. bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
  1282. if (IsTrueDataDependence && EnableForwardingConflictDetection &&
  1283. (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
  1284. ATy != BTy)) {
  1285. LLVM_DEBUG(dbgs() << "LAA: Forward but may prevent st->ld forwarding\n");
  1286. return Dependence::ForwardButPreventsForwarding;
  1287. }
  1288. LLVM_DEBUG(dbgs() << "LAA: Dependence is negative\n");
  1289. return Dependence::Forward;
  1290. }
  1291. // Write to the same location with the same size.
  1292. // Could be improved to assert type sizes are the same (i32 == float, etc).
  1293. if (Val == 0) {
  1294. if (ATy == BTy)
  1295. return Dependence::Forward;
  1296. LLVM_DEBUG(
  1297. dbgs() << "LAA: Zero dependence difference but different types\n");
  1298. return Dependence::Unknown;
  1299. }
  1300. assert(Val.isStrictlyPositive() && "Expect a positive value");
  1301. if (ATy != BTy) {
  1302. LLVM_DEBUG(
  1303. dbgs()
  1304. << "LAA: ReadWrite-Write positive dependency with different types\n");
  1305. return Dependence::Unknown;
  1306. }
  1307. // Bail out early if passed-in parameters make vectorization not feasible.
  1308. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ?
  1309. VectorizerParams::VectorizationFactor : 1);
  1310. unsigned ForcedUnroll = (VectorizerParams::VectorizationInterleave ?
  1311. VectorizerParams::VectorizationInterleave : 1);
  1312. // The minimum number of iterations for a vectorized/unrolled version.
  1313. unsigned MinNumIter = std::max(ForcedFactor * ForcedUnroll, 2U);
// It's not vectorizable if the distance is smaller than the minimum distance
// needed for a vectorized/unrolled version. Vectorizing one iteration ahead
// needs TypeByteSize * Stride bytes. Vectorizing the last iteration needs
// TypeByteSize bytes (there is no need to add the last gap distance).
  1318. //
  1319. // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  1320. // foo(int *A) {
  1321. // int *B = (int *)((char *)A + 14);
  1322. // for (i = 0 ; i < 1024 ; i += 2)
  1323. // B[i] = A[i] + 1;
  1324. // }
  1325. //
  1326. // Two accesses in memory (stride is 2):
  1327. // | A[0] | | A[2] | | A[4] | | A[6] | |
  1328. // | B[0] | | B[2] | | B[4] |
  1329. //
// Distance needed for vectorizing all iterations except the last:
// 4 * 2 * (MinNumIter - 1). Distance needed for the last iteration: 4.
// So the minimum distance needed is: 4 * 2 * (MinNumIter - 1) + 4.
//
// If MinNumIter is 2, it is vectorizable, as the minimum distance needed is
// 12, which is less than the actual distance of 14.
//
// If MinNumIter is 4 (say a user forces the vectorization factor to be 4),
// the minimum distance needed is 28, which is greater than the distance of
// 14, so it is not safe to vectorize.
  1340. uint64_t MinDistanceNeeded =
  1341. TypeByteSize * Stride * (MinNumIter - 1) + TypeByteSize;
  1342. if (MinDistanceNeeded > static_cast<uint64_t>(Distance)) {
  1343. LLVM_DEBUG(dbgs() << "LAA: Failure because of positive distance "
  1344. << Distance << '\n');
  1345. return Dependence::Backward;
  1346. }
  1347. // Unsafe if the minimum distance needed is greater than max safe distance.
  1348. if (MinDistanceNeeded > MaxSafeDepDistBytes) {
  1349. LLVM_DEBUG(dbgs() << "LAA: Failure because it needs at least "
  1350. << MinDistanceNeeded << " size in bytes");
  1351. return Dependence::Backward;
  1352. }
  1353. // Positive distance bigger than max vectorization factor.
// FIXME: Should use the max factor instead of the max distance in bytes,
// which cannot handle different types.
  1356. // E.g. Assume one char is 1 byte in memory and one int is 4 bytes.
  1357. // void foo (int *A, char *B) {
  1358. // for (unsigned i = 0; i < 1024; i++) {
  1359. // A[i+2] = A[i] + 1;
  1360. // B[i+2] = B[i] + 1;
  1361. // }
  1362. // }
  1363. //
// This case is currently unsafe according to the max safe distance. If we
// analyze the two accesses on array B first, the max safe dependence distance
// is 2. When we then analyze the accesses on array A, the minimum distance
// needed is 8, which is greater than the recorded max safe distance of 2, so
// vectorization is rejected. Yet both A and B could in fact be vectorized
// with a factor of 2.
  1369. MaxSafeDepDistBytes =
  1370. std::min(static_cast<uint64_t>(Distance), MaxSafeDepDistBytes);
  1371. bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
  1372. if (IsTrueDataDependence && EnableForwardingConflictDetection &&
  1373. couldPreventStoreLoadForward(Distance, TypeByteSize))
  1374. return Dependence::BackwardVectorizableButPreventsForwarding;
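// For illustration: with MaxSafeDepDistBytes = 16, TypeByteSize = 4 and
// Stride = 1, the computation below yields MaxVF = 4 elements, i.e. a
// maximum safe register width of 128 bits.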
  1375. uint64_t MaxVF = MaxSafeDepDistBytes / (TypeByteSize * Stride);
  1376. LLVM_DEBUG(dbgs() << "LAA: Positive distance " << Val.getSExtValue()
  1377. << " with max VF = " << MaxVF << '\n');
  1378. uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
  1379. MaxSafeRegisterWidth = std::min(MaxSafeRegisterWidth, MaxVFInBits);
  1380. return Dependence::BackwardVectorizable;
  1381. }
  1382. bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
  1383. MemAccessInfoList &CheckDeps,
  1384. const ValueToValueMap &Strides) {
  1385. MaxSafeDepDistBytes = -1;
  1386. SmallPtrSet<MemAccessInfo, 8> Visited;
  1387. for (MemAccessInfo CurAccess : CheckDeps) {
  1388. if (Visited.count(CurAccess))
  1389. continue;
  1390. // Get the relevant memory access set.
  1391. EquivalenceClasses<MemAccessInfo>::iterator I =
  1392. AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
  1393. // Check accesses within this set.
  1394. EquivalenceClasses<MemAccessInfo>::member_iterator AI =
  1395. AccessSets.member_begin(I);
  1396. EquivalenceClasses<MemAccessInfo>::member_iterator AE =
  1397. AccessSets.member_end();
  1398. // Check every access pair.
  1399. while (AI != AE) {
  1400. Visited.insert(*AI);
  1401. EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
  1402. while (OI != AE) {
  1403. // Check every accessing instruction pair in program order.
  1404. for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
  1405. I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
  1406. for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
  1407. I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
  1408. auto A = std::make_pair(&*AI, *I1);
  1409. auto B = std::make_pair(&*OI, *I2);
  1410. assert(*I1 != *I2);
  1411. if (*I1 > *I2)
  1412. std::swap(A, B);
  1413. Dependence::DepType Type =
  1414. isDependent(*A.first, A.second, *B.first, B.second, Strides);
  1415. mergeInStatus(Dependence::isSafeForVectorization(Type));
  1416. // Gather dependences unless we accumulated MaxDependences
  1417. // dependences. In that case return as soon as we find the first
  1418. // unsafe dependence. This puts a limit on this quadratic
  1419. // algorithm.
  1420. if (RecordDependences) {
  1421. if (Type != Dependence::NoDep)
  1422. Dependences.push_back(Dependence(A.second, B.second, Type));
  1423. if (Dependences.size() >= MaxDependences) {
  1424. RecordDependences = false;
  1425. Dependences.clear();
  1426. LLVM_DEBUG(dbgs()
  1427. << "Too many dependences, stopped recording\n");
  1428. }
  1429. }
  1430. if (!RecordDependences && !isSafeForVectorization())
  1431. return false;
  1432. }
  1433. ++OI;
  1434. }
  1435. AI++;
  1436. }
  1437. }
  1438. LLVM_DEBUG(dbgs() << "Total Dependences: " << Dependences.size() << "\n");
  1439. return isSafeForVectorization();
  1440. }
  1441. SmallVector<Instruction *, 4>
  1442. MemoryDepChecker::getInstructionsForAccess(Value *Ptr, bool isWrite) const {
  1443. MemAccessInfo Access(Ptr, isWrite);
  1444. auto &IndexVector = Accesses.find(Access)->second;
  1445. SmallVector<Instruction *, 4> Insts;
  1446. transform(IndexVector,
  1447. std::back_inserter(Insts),
  1448. [&](unsigned Idx) { return this->InstMap[Idx]; });
  1449. return Insts;
  1450. }
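// Note: the order of these names must match the order of the values in the
// Dependence::DepType enumeration, since the array is indexed by Type below.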
  1451. const char *MemoryDepChecker::Dependence::DepName[] = {
  1452. "NoDep", "Unknown", "Forward", "ForwardButPreventsForwarding", "Backward",
  1453. "BackwardVectorizable", "BackwardVectorizableButPreventsForwarding"};
  1454. void MemoryDepChecker::Dependence::print(
  1455. raw_ostream &OS, unsigned Depth,
  1456. const SmallVectorImpl<Instruction *> &Instrs) const {
  1457. OS.indent(Depth) << DepName[Type] << ":\n";
  1458. OS.indent(Depth + 2) << *Instrs[Source] << " -> \n";
  1459. OS.indent(Depth + 2) << *Instrs[Destination] << "\n";
  1460. }
  1461. bool LoopAccessInfo::canAnalyzeLoop() {
  1462. // We need to have a loop header.
  1463. LLVM_DEBUG(dbgs() << "LAA: Found a loop in "
  1464. << TheLoop->getHeader()->getParent()->getName() << ": "
  1465. << TheLoop->getHeader()->getName() << '\n');
  1466. // We can only analyze innermost loops.
  1467. if (!TheLoop->empty()) {
  1468. LLVM_DEBUG(dbgs() << "LAA: loop is not the innermost loop\n");
  1469. recordAnalysis("NotInnerMostLoop") << "loop is not the innermost loop";
  1470. return false;
  1471. }
  1472. // We must have a single backedge.
  1473. if (TheLoop->getNumBackEdges() != 1) {
  1474. LLVM_DEBUG(
  1475. dbgs() << "LAA: loop control flow is not understood by analyzer\n");
  1476. recordAnalysis("CFGNotUnderstood")
  1477. << "loop control flow is not understood by analyzer";
  1478. return false;
  1479. }
  1480. // We must have a single exiting block.
  1481. if (!TheLoop->getExitingBlock()) {
  1482. LLVM_DEBUG(
  1483. dbgs() << "LAA: loop control flow is not understood by analyzer\n");
  1484. recordAnalysis("CFGNotUnderstood")
  1485. << "loop control flow is not understood by analyzer";
  1486. return false;
  1487. }
// We only handle bottom-tested loops, i.e. loops in which the condition is
// checked at the end of each iteration. With that we can assume that all
// instructions in the loop are executed the same number of times.
  1491. if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
  1492. LLVM_DEBUG(
  1493. dbgs() << "LAA: loop control flow is not understood by analyzer\n");
  1494. recordAnalysis("CFGNotUnderstood")
  1495. << "loop control flow is not understood by analyzer";
  1496. return false;
  1497. }
  1498. // ScalarEvolution needs to be able to find the exit count.
  1499. const SCEV *ExitCount = PSE->getBackedgeTakenCount();
  1500. if (ExitCount == PSE->getSE()->getCouldNotCompute()) {
  1501. recordAnalysis("CantComputeNumberOfIterations")
  1502. << "could not determine number of loop iterations";
  1503. LLVM_DEBUG(dbgs() << "LAA: SCEV could not compute the loop exit count.\n");
  1504. return false;
  1505. }
  1506. return true;
  1507. }
  1508. void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
  1509. const TargetLibraryInfo *TLI,
  1510. DominatorTree *DT) {
  1511. typedef SmallPtrSet<Value*, 16> ValueSet;
  1512. // Holds the Load and Store instructions.
  1513. SmallVector<LoadInst *, 16> Loads;
  1514. SmallVector<StoreInst *, 16> Stores;
  1515. // Holds all the different accesses in the loop.
  1516. unsigned NumReads = 0;
  1517. unsigned NumReadWrites = 0;
  1518. PtrRtChecking->Pointers.clear();
  1519. PtrRtChecking->Need = false;
  1520. const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
  1521. // For each block.
  1522. for (BasicBlock *BB : TheLoop->blocks()) {
  1523. // Scan the BB and collect legal loads and stores.
  1524. for (Instruction &I : *BB) {
  1525. // If this is a load, save it. If this instruction can read from memory
  1526. // but is not a load, then we quit. Notice that we don't handle function
  1527. // calls that read or write.
  1528. if (I.mayReadFromMemory()) {
  1529. // Many math library functions read the rounding mode. We will only
  1530. // vectorize a loop if it contains known function calls that don't set
  1531. // the flag. Therefore, it is safe to ignore this read from memory.
  1532. auto *Call = dyn_cast<CallInst>(&I);
  1533. if (Call && getVectorIntrinsicIDForCall(Call, TLI))
  1534. continue;
  1535. // If the function has an explicit vectorized counterpart, we can safely
  1536. // assume that it can be vectorized.
  1537. if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
  1538. TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
  1539. continue;
  1540. auto *Ld = dyn_cast<LoadInst>(&I);
  1541. if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
  1542. recordAnalysis("NonSimpleLoad", Ld)
  1543. << "read with atomic ordering or volatile read";
  1544. LLVM_DEBUG(dbgs() << "LAA: Found a non-simple load.\n");
  1545. CanVecMem = false;
  1546. return;
  1547. }
  1548. NumLoads++;
  1549. Loads.push_back(Ld);
  1550. DepChecker->addAccess(Ld);
  1551. if (EnableMemAccessVersioning)
  1552. collectStridedAccess(Ld);
  1553. continue;
  1554. }
  1555. // Save 'store' instructions. Abort if other instructions write to memory.
  1556. if (I.mayWriteToMemory()) {
  1557. auto *St = dyn_cast<StoreInst>(&I);
  1558. if (!St) {
  1559. recordAnalysis("CantVectorizeInstruction", St)
  1560. << "instruction cannot be vectorized";
  1561. CanVecMem = false;
  1562. return;
  1563. }
  1564. if (!St->isSimple() && !IsAnnotatedParallel) {
  1565. recordAnalysis("NonSimpleStore", St)
  1566. << "write with atomic ordering or volatile write";
  1567. LLVM_DEBUG(dbgs() << "LAA: Found a non-simple store.\n");
  1568. CanVecMem = false;
  1569. return;
  1570. }
  1571. NumStores++;
  1572. Stores.push_back(St);
  1573. DepChecker->addAccess(St);
  1574. if (EnableMemAccessVersioning)
  1575. collectStridedAccess(St);
  1576. }
  1577. } // Next instr.
  1578. } // Next block.
  1579. // Now we have two lists that hold the loads and the stores.
  1580. // Next, we find the pointers that they use.
  1581. // Check if we see any stores. If there are no stores, then we don't
  1582. // care if the pointers are *restrict*.
if (Stores.empty()) {
  1584. LLVM_DEBUG(dbgs() << "LAA: Found a read-only loop!\n");
  1585. CanVecMem = true;
  1586. return;
  1587. }
  1588. MemoryDepChecker::DepCandidates DependentAccesses;
  1589. AccessAnalysis Accesses(TheLoop->getHeader()->getModule()->getDataLayout(),
  1590. TheLoop, AA, LI, DependentAccesses, *PSE);
  1591. // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  1592. // multiple times on the same object. If the ptr is accessed twice, once
  1593. // for read and once for write, it will only appear once (on the write
  1594. // list). This is okay, since we are going to check for conflicts between
  1595. // writes and between reads and writes, but not between reads and reads.
  1596. ValueSet Seen;
  1597. // Record uniform store addresses to identify if we have multiple stores
  1598. // to the same address.
  1599. ValueSet UniformStores;
  1600. for (StoreInst *ST : Stores) {
  1601. Value *Ptr = ST->getPointerOperand();
  1602. if (isUniform(Ptr))
  1603. HasDependenceInvolvingLoopInvariantAddress |=
  1604. !UniformStores.insert(Ptr).second;
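// (insert() returns false for a second store to the same invariant address,
// which is what raises HasDependenceInvolvingLoopInvariantAddress above.)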
// If we did *not* see this pointer before, insert it into the read-write
// list. At this phase it is only a 'write' list.
  1607. if (Seen.insert(Ptr).second) {
  1608. ++NumReadWrites;
  1609. MemoryLocation Loc = MemoryLocation::get(ST);
  1610. // The TBAA metadata could have a control dependency on the predication
  1611. // condition, so we cannot rely on it when determining whether or not we
  1612. // need runtime pointer checks.
  1613. if (blockNeedsPredication(ST->getParent(), TheLoop, DT))
  1614. Loc.AATags.TBAA = nullptr;
  1615. Accesses.addStore(Loc);
  1616. }
  1617. }
  1618. if (IsAnnotatedParallel) {
  1619. LLVM_DEBUG(
  1620. dbgs() << "LAA: A loop annotated parallel, ignore memory dependency "
  1621. << "checks.\n");
  1622. CanVecMem = true;
  1623. return;
  1624. }
  1625. for (LoadInst *LD : Loads) {
  1626. Value *Ptr = LD->getPointerOperand();
// If we did *not* see this pointer before, insert it into the
// read list. If we *did* see it before, then it is already in
// the read-write list. This allows us to vectorize expressions
// such as A[i] += x, because the address of A[i] is a read-write
// pointer. This only works if the index of A[i] is consecutive.
  1632. // If the address of i is unknown (for example A[B[i]]) then we may
  1633. // read a few words, modify, and write a few words, and some of the
  1634. // words may be written to the same address.
  1635. bool IsReadOnlyPtr = false;
  1636. if (Seen.insert(Ptr).second ||
  1637. !getPtrStride(*PSE, Ptr, TheLoop, SymbolicStrides)) {
  1638. ++NumReads;
  1639. IsReadOnlyPtr = true;
  1640. }
// See if there is an unsafe dependency between a load from a uniform address
// and a store to the same uniform address.
  1643. if (UniformStores.count(Ptr)) {
  1644. LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
  1645. "load and uniform store to the same address!\n");
  1646. HasDependenceInvolvingLoopInvariantAddress = true;
  1647. }
  1648. MemoryLocation Loc = MemoryLocation::get(LD);
  1649. // The TBAA metadata could have a control dependency on the predication
  1650. // condition, so we cannot rely on it when determining whether or not we
  1651. // need runtime pointer checks.
  1652. if (blockNeedsPredication(LD->getParent(), TheLoop, DT))
  1653. Loc.AATags.TBAA = nullptr;
  1654. Accesses.addLoad(Loc, IsReadOnlyPtr);
  1655. }
// If we write (or read-write) to a single destination and there are no
// other reads in this loop then it is safe to vectorize.
  1658. if (NumReadWrites == 1 && NumReads == 0) {
  1659. LLVM_DEBUG(dbgs() << "LAA: Found a write-only loop!\n");
  1660. CanVecMem = true;
  1661. return;
  1662. }
  1663. // Build dependence sets and check whether we need a runtime pointer bounds
  1664. // check.
  1665. Accesses.buildDependenceSets();
  1666. // Find pointers with computable bounds. We are going to use this information
  1667. // to place a runtime bound check.
  1668. bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(),
  1669. TheLoop, SymbolicStrides);
  1670. if (!CanDoRTIfNeeded) {
  1671. recordAnalysis("CantIdentifyArrayBounds") << "cannot identify array bounds";
  1672. LLVM_DEBUG(dbgs() << "LAA: We can't vectorize because we can't find "
  1673. << "the array bounds.\n");
  1674. CanVecMem = false;
  1675. return;
  1676. }
  1677. LLVM_DEBUG(
  1678. dbgs() << "LAA: We can perform a memory runtime check if needed.\n");
  1679. CanVecMem = true;
  1680. if (Accesses.isDependencyCheckNeeded()) {
  1681. LLVM_DEBUG(dbgs() << "LAA: Checking memory dependencies\n");
  1682. CanVecMem = DepChecker->areDepsSafe(
  1683. DependentAccesses, Accesses.getDependenciesToCheck(), SymbolicStrides);
  1684. MaxSafeDepDistBytes = DepChecker->getMaxSafeDepDistBytes();
  1685. if (!CanVecMem && DepChecker->shouldRetryWithRuntimeCheck()) {
  1686. LLVM_DEBUG(dbgs() << "LAA: Retrying with memory checks\n");
  1687. // Clear the dependency checks. We assume they are not needed.
  1688. Accesses.resetDepChecks(*DepChecker);
  1689. PtrRtChecking->reset();
  1690. PtrRtChecking->Need = true;
  1691. auto *SE = PSE->getSE();
  1692. CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(*PtrRtChecking, SE, TheLoop,
  1693. SymbolicStrides, true);
  1694. // Check that we found the bounds for the pointer.
  1695. if (!CanDoRTIfNeeded) {
  1696. recordAnalysis("CantCheckMemDepsAtRunTime")
  1697. << "cannot check memory dependencies at runtime";
  1698. LLVM_DEBUG(dbgs() << "LAA: Can't vectorize with memory checks\n");
  1699. CanVecMem = false;
  1700. return;
  1701. }
  1702. CanVecMem = true;
  1703. }
  1704. }
  1705. if (CanVecMem)
  1706. LLVM_DEBUG(
  1707. dbgs() << "LAA: No unsafe dependent memory operations in loop. We"
  1708. << (PtrRtChecking->Need ? "" : " don't")
  1709. << " need runtime memory checks.\n");
  1710. else {
  1711. recordAnalysis("UnsafeMemDep")
  1712. << "unsafe dependent memory operations in loop. Use "
  1713. "#pragma loop distribute(enable) to allow loop distribution "
  1714. "to attempt to isolate the offending operations into a separate "
  1715. "loop";
  1716. LLVM_DEBUG(dbgs() << "LAA: unsafe dependent memory operations in loop\n");
  1717. }
  1718. }
  1719. bool LoopAccessInfo::blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
  1720. DominatorTree *DT) {
  1721. assert(TheLoop->contains(BB) && "Unknown block used");
  1722. // Blocks that do not dominate the latch need predication.
  1723. BasicBlock* Latch = TheLoop->getLoopLatch();
  1724. return !DT->dominates(BB, Latch);
  1725. }
  1726. OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
  1727. Instruction *I) {
  1728. assert(!Report && "Multiple reports generated");
  1729. Value *CodeRegion = TheLoop->getHeader();
  1730. DebugLoc DL = TheLoop->getStartLoc();
  1731. if (I) {
  1732. CodeRegion = I->getParent();
// If the instruction has a debug location, prefer it; otherwise fall back to
// the loop's start location.
  1735. if (I->getDebugLoc())
  1736. DL = I->getDebugLoc();
  1737. }
  1738. Report = make_unique<OptimizationRemarkAnalysis>(DEBUG_TYPE, RemarkName, DL,
  1739. CodeRegion);
  1740. return *Report;
  1741. }
  1742. bool LoopAccessInfo::isUniform(Value *V) const {
  1743. auto *SE = PSE->getSE();
  1744. // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
  1745. // never considered uniform.
  1746. // TODO: Is this really what we want? Even without FP SCEV, we may want some
  1747. // trivially loop-invariant FP values to be considered uniform.
  1748. if (!SE->isSCEVable(V->getType()))
  1749. return false;
  1750. return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
  1751. }
  1752. // FIXME: this function is currently a duplicate of the one in
  1753. // LoopVectorize.cpp.
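/// Return \p FirstInst if it is already set; otherwise return \p V if it is
/// an instruction in the same block as \p Loc, or nullptr. Used to track the
/// first instruction emitted for the runtime checks.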
  1754. static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
  1755. Instruction *Loc) {
  1756. if (FirstInst)
  1757. return FirstInst;
  1758. if (Instruction *I = dyn_cast<Instruction>(V))
  1759. return I->getParent() == Loc->getParent() ? I : nullptr;
  1760. return nullptr;
  1761. }
  1762. namespace {
  1763. /// IR Values for the lower and upper bounds of a pointer evolution. We
  1764. /// need to use value-handles because SCEV expansion can invalidate previously
  1765. /// expanded values. Thus expansion of a pointer can invalidate the bounds for
  1766. /// a previous one.
  1767. struct PointerBounds {
  1768. TrackingVH<Value> Start;
  1769. TrackingVH<Value> End;
  1770. };
  1771. } // end anonymous namespace
  1772. /// Expand code for the lower and upper bound of the pointer group \p CG
  1773. /// in \p TheLoop. \return the values for the bounds.
  1774. static PointerBounds
  1775. expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
  1776. Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE,
  1777. const RuntimePointerChecking &PtrRtChecking) {
  1778. Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
  1779. const SCEV *Sc = SE->getSCEV(Ptr);
  1780. unsigned AS = Ptr->getType()->getPointerAddressSpace();
  1781. LLVMContext &Ctx = Loc->getContext();
  1782. // Use this type for pointer arithmetic.
  1783. Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
  1784. if (SE->isLoopInvariant(Sc, TheLoop)) {
  1785. LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
  1786. << *Ptr << "\n");
  1787. // Ptr could be in the loop body. If so, expand a new one at the correct
  1788. // location.
  1789. Instruction *Inst = dyn_cast<Instruction>(Ptr);
  1790. Value *NewPtr = (Inst && TheLoop->contains(Inst))
  1791. ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
  1792. : Ptr;
  1793. // We must return a half-open range, which means incrementing Sc.
  1794. const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
  1795. Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
  1796. return {NewPtr, NewPtrPlusOne};
  1797. } else {
  1798. Value *Start = nullptr, *End = nullptr;
  1799. LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
  1800. Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
  1801. End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
  1802. LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
  1803. << "\n");
  1804. return {Start, End};
  1805. }
  1806. }
  1807. /// Turns a collection of checks into a collection of expanded upper and
  1808. /// lower bounds for both pointers in the check.
  1809. static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
  1810. const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
  1811. Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp,
  1812. const RuntimePointerChecking &PtrRtChecking) {
  1813. SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
  1814. // Here we're relying on the SCEV Expander's cache to only emit code for the
  1815. // same bounds once.
  1816. transform(
  1817. PointerChecks, std::back_inserter(ChecksWithBounds),
  1818. [&](const RuntimePointerChecking::PointerCheck &Check) {
  1819. PointerBounds
  1820. First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking),
  1821. Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking);
  1822. return std::make_pair(First, Second);
  1823. });
  1824. return ChecksWithBounds;
  1825. }
  1826. std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
  1827. Instruction *Loc,
  1828. const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
  1829. const {
  1830. const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  1831. auto *SE = PSE->getSE();
  1832. SCEVExpander Exp(*SE, DL, "induction");
  1833. auto ExpandedChecks =
  1834. expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, *PtrRtChecking);
  1835. LLVMContext &Ctx = Loc->getContext();
  1836. Instruction *FirstInst = nullptr;
  1837. IRBuilder<> ChkBuilder(Loc);
  1838. // Our instructions might fold to a constant.
  1839. Value *MemoryRuntimeCheck = nullptr;
  1840. for (const auto &Check : ExpandedChecks) {
  1841. const PointerBounds &A = Check.first, &B = Check.second;
  1842. // Check if two pointers (A and B) conflict where conflict is computed as:
  1843. // start(A) <= end(B) && start(B) <= end(A)
  1844. unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
  1845. unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
  1846. assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
  1847. (AS1 == A.End->getType()->getPointerAddressSpace()) &&
  1848. "Trying to bounds check pointers with different address spaces");
  1849. Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
  1850. Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
  1851. Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
  1852. Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
  1853. Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
  1854. Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
  1855. // [A|B].Start points to the first accessed byte under base [A|B].
  1856. // [A|B].End points to the last accessed byte, plus one.
  1857. // There is no conflict when the intervals are disjoint:
  1858. // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
  1859. //
  1860. // bound0 = (B.Start < A.End)
  1861. // bound1 = (A.Start < B.End)
  1862. // IsConflict = bound0 & bound1
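// For illustration: A = [0x100, 0x110) and B = [0x108, 0x118) conflict, since
// A.Start (0x100) < B.End (0x118) and B.Start (0x108) < A.End (0x110);
// disjoint ranges fail at least one of the two comparisons.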
  1863. Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
  1864. FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
  1865. Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
  1866. FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
  1867. Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
  1868. FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
  1869. if (MemoryRuntimeCheck) {
  1870. IsConflict =
  1871. ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
  1872. FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
  1873. }
  1874. MemoryRuntimeCheck = IsConflict;
  1875. }
  1876. if (!MemoryRuntimeCheck)
  1877. return std::make_pair(nullptr, nullptr);
// We have to do this trickery because the IRBuilder might fold the check into
// a constant expression, in which case there is no Instruction anchored in
// the block.
  1881. Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
  1882. ConstantInt::getTrue(Ctx));
  1883. ChkBuilder.Insert(Check, "memcheck.conflict");
  1884. FirstInst = getFirstInst(FirstInst, Check, Loc);
  1885. return std::make_pair(FirstInst, Check);
  1886. }
  1887. std::pair<Instruction *, Instruction *>
  1888. LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
  1889. if (!PtrRtChecking->Need)
  1890. return std::make_pair(nullptr, nullptr);
  1891. return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
  1892. }
  1893. void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
  1894. Value *Ptr = nullptr;
  1895. if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
  1896. Ptr = LI->getPointerOperand();
  1897. else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
  1898. Ptr = SI->getPointerOperand();
  1899. else
  1900. return;
  1901. Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
  1902. if (!Stride)
  1903. return;
  1904. LLVM_DEBUG(dbgs() << "LAA: Found a strided access that is a candidate for "
  1905. "versioning:");
  1906. LLVM_DEBUG(dbgs() << " Ptr: " << *Ptr << " Stride: " << *Stride << "\n");
// Avoid adding the "Stride == 1" predicate when we know that
// Stride >= Trip-Count. Such a predicate would effectively optimize a loop
// with at most a single iteration, since Stride == 1 combined with
// Stride >= Trip-Count implies Trip-Count <= 1.
  1910. //
  1911. // TODO: We are currently not making a very informed decision on when it is
  1912. // beneficial to apply stride versioning. It might make more sense that the
  1913. // users of this analysis (such as the vectorizer) will trigger it, based on
  1914. // their specific cost considerations; For example, in cases where stride
  1915. // versioning does not help resolving memory accesses/dependences, the
  1916. // vectorizer should evaluate the cost of the runtime test, and the benefit
  1917. // of various possible stride specializations, considering the alternatives
  1918. // of using gather/scatters (if available).
  1919. const SCEV *StrideExpr = PSE->getSCEV(Stride);
  1920. const SCEV *BETakenCount = PSE->getBackedgeTakenCount();
  1921. // Match the types so we can compare the stride and the BETakenCount.
  1922. // The Stride can be positive/negative, so we sign extend Stride;
  1923. // The backedgeTakenCount is non-negative, so we zero extend BETakenCount.
  1924. const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
  1925. uint64_t StrideTypeSize = DL.getTypeAllocSize(StrideExpr->getType());
  1926. uint64_t BETypeSize = DL.getTypeAllocSize(BETakenCount->getType());
  1927. const SCEV *CastedStride = StrideExpr;
  1928. const SCEV *CastedBECount = BETakenCount;
  1929. ScalarEvolution *SE = PSE->getSE();
  1930. if (BETypeSize >= StrideTypeSize)
  1931. CastedStride = SE->getNoopOrSignExtend(StrideExpr, BETakenCount->getType());
  1932. else
  1933. CastedBECount = SE->getZeroExtendExpr(BETakenCount, StrideExpr->getType());
  1934. const SCEV *StrideMinusBETaken = SE->getMinusSCEV(CastedStride, CastedBECount);
  1935. // Since TripCount == BackEdgeTakenCount + 1, checking:
  1936. // "Stride >= TripCount" is equivalent to checking:
  1937. // Stride - BETakenCount > 0
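// For illustration: if StrideExpr is the constant 100 and BETakenCount is 10,
// then StrideMinusBETaken = 90 > 0, so a "Stride == 1" predicate would only
// describe a loop that runs at most once and versioning would be pointless.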
  1938. if (SE->isKnownPositive(StrideMinusBETaken)) {
  1939. LLVM_DEBUG(
  1940. dbgs() << "LAA: Stride>=TripCount; No point in versioning as the "
  1941. "Stride==1 predicate will imply that the loop executes "
  1942. "at most once.\n");
  1943. return;
  1944. }
  1945. LLVM_DEBUG(dbgs() << "LAA: Found a strided access that we can version.");
  1946. SymbolicStrides[Ptr] = Stride;
  1947. StrideSet.insert(Stride);
  1948. }
  1949. LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
  1950. const TargetLibraryInfo *TLI, AliasAnalysis *AA,
  1951. DominatorTree *DT, LoopInfo *LI)
  1952. : PSE(llvm::make_unique<PredicatedScalarEvolution>(*SE, *L)),
  1953. PtrRtChecking(llvm::make_unique<RuntimePointerChecking>(SE)),
  1954. DepChecker(llvm::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
  1955. NumLoads(0), NumStores(0), MaxSafeDepDistBytes(-1), CanVecMem(false),
  1956. HasDependenceInvolvingLoopInvariantAddress(false) {
  1957. if (canAnalyzeLoop())
  1958. analyzeLoop(AA, LI, TLI, DT);
  1959. }
  1960. void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
  1961. if (CanVecMem) {
  1962. OS.indent(Depth) << "Memory dependences are safe";
  1963. if (MaxSafeDepDistBytes != -1ULL)
  1964. OS << " with a maximum dependence distance of " << MaxSafeDepDistBytes
  1965. << " bytes";
  1966. if (PtrRtChecking->Need)
  1967. OS << " with run-time checks";
  1968. OS << "\n";
  1969. }
  1970. if (Report)
  1971. OS.indent(Depth) << "Report: " << Report->getMsg() << "\n";
  1972. if (auto *Dependences = DepChecker->getDependences()) {
  1973. OS.indent(Depth) << "Dependences:\n";
  1974. for (auto &Dep : *Dependences) {
  1975. Dep.print(OS, Depth + 2, DepChecker->getMemoryInstructions());
  1976. OS << "\n";
  1977. }
  1978. } else
  1979. OS.indent(Depth) << "Too many dependences, not recorded\n";
// List the pairs of accesses that need run-time checks to prove independence.
  1981. PtrRtChecking->print(OS, Depth);
  1982. OS << "\n";
  1983. OS.indent(Depth) << "Non vectorizable stores to invariant address were "
  1984. << (HasDependenceInvolvingLoopInvariantAddress ? "" : "not ")
  1985. << "found in loop.\n";
  1986. OS.indent(Depth) << "SCEV assumptions:\n";
  1987. PSE->getUnionPredicate().print(OS, Depth);
  1988. OS << "\n";
  1989. OS.indent(Depth) << "Expressions re-written:\n";
  1990. PSE->print(OS, Depth);
  1991. }
  1992. const LoopAccessInfo &LoopAccessLegacyAnalysis::getInfo(Loop *L) {
  1993. auto &LAI = LoopAccessInfoMap[L];
  1994. if (!LAI)
  1995. LAI = llvm::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
  1996. return *LAI.get();
  1997. }
  1998. void LoopAccessLegacyAnalysis::print(raw_ostream &OS, const Module *M) const {
  1999. LoopAccessLegacyAnalysis &LAA = *const_cast<LoopAccessLegacyAnalysis *>(this);
  2000. for (Loop *TopLevelLoop : *LI)
  2001. for (Loop *L : depth_first(TopLevelLoop)) {
  2002. OS.indent(2) << L->getHeader()->getName() << ":\n";
  2003. auto &LAI = LAA.getInfo(L);
  2004. LAI.print(OS, 4);
  2005. }
  2006. }
  2007. bool LoopAccessLegacyAnalysis::runOnFunction(Function &F) {
  2008. SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  2009. auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  2010. TLI = TLIP ? &TLIP->getTLI() : nullptr;
  2011. AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  2012. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  2013. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  2014. return false;
  2015. }
  2016. void LoopAccessLegacyAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
  2017. AU.addRequired<ScalarEvolutionWrapperPass>();
  2018. AU.addRequired<AAResultsWrapperPass>();
  2019. AU.addRequired<DominatorTreeWrapperPass>();
  2020. AU.addRequired<LoopInfoWrapperPass>();
  2021. AU.setPreservesAll();
  2022. }
  2023. char LoopAccessLegacyAnalysis::ID = 0;
  2024. static const char laa_name[] = "Loop Access Analysis";
  2025. #define LAA_NAME "loop-accesses"
  2026. INITIALIZE_PASS_BEGIN(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
  2027. INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  2028. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  2029. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  2030. INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
  2031. INITIALIZE_PASS_END(LoopAccessLegacyAnalysis, LAA_NAME, laa_name, false, true)
  2032. AnalysisKey LoopAccessAnalysis::Key;
  2033. LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
  2034. LoopStandardAnalysisResults &AR) {
  2035. return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
  2036. }
  2037. namespace llvm {
  2038. Pass *createLAAPass() {
  2039. return new LoopAccessLegacyAnalysis();
  2040. }
  2041. } // end namespace llvm