// LoopVectorize.cpp
  1. //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
  11. // and generates target-independent LLVM-IR. Legalization of the IR is done
  12. // in the codegen. However, the vectorizer uses (will use) the codegen
  13. // interfaces to generate IR that is likely to result in an optimal binary.
  14. //
  15. // The loop vectorizer combines consecutive loop iterations into a single
  16. // 'wide' iteration. After this transformation the index is incremented
  17. // by the SIMD vector width, and not by one.
  18. //
  19. // This pass has four parts:
  20. // 1. The main loop pass that drives the different parts.
  21. // 2. LoopVectorizationLegality - A unit that checks for the legality
  22. // of the vectorization.
  23. // 3. InnerLoopVectorizer - A unit that performs the actual
  24. // widening of instructions.
  25. // 4. LoopVectorizationCostModel - A unit that checks for the profitability
  26. // of vectorization. It decides on the optimal vector width, which
  27. // can be one, if vectorization is not profitable.
  28. //
  29. //===----------------------------------------------------------------------===//
  30. //
  31. // The reduction-variable vectorization is based on the paper:
  32. // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
  33. //
  34. // Variable uniformity checks are inspired by:
  35. // Karrenberg, R. and Hack, S. Whole Function Vectorization.
  36. //
  37. // Other ideas/concepts are from:
  38. // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
  39. //
  40. // S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
  41. // Vectorizing Compilers.
  42. //
  43. //===----------------------------------------------------------------------===//
  44. #define LV_NAME "loop-vectorize"
  45. #define DEBUG_TYPE LV_NAME
  46. #include "llvm/Transforms/Vectorize.h"
  47. #include "llvm/ADT/DenseMap.h"
  48. #include "llvm/ADT/MapVector.h"
  49. #include "llvm/ADT/SmallPtrSet.h"
  50. #include "llvm/ADT/SmallSet.h"
  51. #include "llvm/ADT/SmallVector.h"
  52. #include "llvm/ADT/StringExtras.h"
  53. #include "llvm/Analysis/AliasAnalysis.h"
  54. #include "llvm/Analysis/AliasSetTracker.h"
  55. #include "llvm/Analysis/Dominators.h"
  56. #include "llvm/Analysis/LoopInfo.h"
  57. #include "llvm/Analysis/LoopIterator.h"
  58. #include "llvm/Analysis/LoopPass.h"
  59. #include "llvm/Analysis/ScalarEvolution.h"
  60. #include "llvm/Analysis/ScalarEvolutionExpander.h"
  61. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  62. #include "llvm/Analysis/TargetTransformInfo.h"
  63. #include "llvm/Analysis/ValueTracking.h"
  64. #include "llvm/Analysis/Verifier.h"
  65. #include "llvm/IR/Constants.h"
  66. #include "llvm/IR/DataLayout.h"
  67. #include "llvm/IR/DerivedTypes.h"
  68. #include "llvm/IR/Function.h"
  69. #include "llvm/IR/IRBuilder.h"
  70. #include "llvm/IR/Instructions.h"
  71. #include "llvm/IR/IntrinsicInst.h"
  72. #include "llvm/IR/LLVMContext.h"
  73. #include "llvm/IR/Module.h"
  74. #include "llvm/IR/Type.h"
  75. #include "llvm/IR/Value.h"
  76. #include "llvm/Pass.h"
  77. #include "llvm/Support/CommandLine.h"
  78. #include "llvm/Support/Debug.h"
  79. #include "llvm/Support/raw_ostream.h"
  80. #include "llvm/Transforms/Scalar.h"
  81. #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  82. #include "llvm/Transforms/Utils/Local.h"
  83. #include <algorithm>
  84. #include <map>
  85. using namespace llvm;
  86. static cl::opt<unsigned>
  87. VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
  88. cl::desc("Sets the SIMD width. Zero is autoselect."));
  89. static cl::opt<unsigned>
  90. VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
  91. cl::desc("Sets the vectorization unroll count. "
  92. "Zero is autoselect."));
  93. static cl::opt<bool>
  94. EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
  95. cl::desc("Enable if-conversion during vectorization."));
  96. /// We don't vectorize loops with a known constant trip count below this number.
  97. static const unsigned TinyTripCountVectorThreshold = 16;
  98. /// We don't unroll loops with a known constant trip count below this number.
  99. static const unsigned TinyTripCountUnrollThreshold = 128;
  100. /// When performing a runtime memory check, do not check more than this
  101. /// number of pointers. Notice that the check is quadratic!
  102. static const unsigned RuntimeMemoryCheckThreshold = 4;
  103. namespace {
  104. // Forward declarations.
  105. class LoopVectorizationLegality;
  106. class LoopVectorizationCostModel;
  107. /// InnerLoopVectorizer vectorizes loops which contain only one basic
  108. /// block to a specified vectorization factor (VF).
  109. /// This class performs the widening of scalars into vectors, or multiple
  110. /// scalars. This class also implements the following features:
  111. /// * It inserts an epilogue loop for handling loops that don't have iteration
  112. /// counts that are known to be a multiple of the vectorization factor.
  113. /// * It handles the code generation for reduction variables.
  114. /// * Scalarization (implementation using scalars) of un-vectorizable
  115. /// instructions.
  116. /// InnerLoopVectorizer does not perform any vectorization-legality
  117. /// checks, and relies on the caller to check for the different legality
  118. /// aspects. The InnerLoopVectorizer relies on the
  119. /// LoopVectorizationLegality class to provide information about the induction
  120. /// and reduction variables that were found to a given vectorization factor.
  121. class InnerLoopVectorizer {
  122. public:
  123. InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
  124. DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
  125. unsigned UnrollFactor)
  126. : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
  127. UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
  128. OldInduction(0), WidenMap(UnrollFactor) {}
  129. // Perform the actual loop widening (vectorization).
  130. void vectorize(LoopVectorizationLegality *Legal) {
  131. // Create a new empty loop. Unlink the old loop and connect the new one.
  132. createEmptyLoop(Legal);
  133. // Widen each instruction in the old loop to a new one in the new loop.
  134. // Use the Legality module to find the induction and reduction variables.
  135. vectorizeLoop(Legal);
  136. // Register the new loop and update the analysis passes.
  137. updateAnalysis();
  138. }
  139. private:
  140. /// A small list of PHINodes.
  141. typedef SmallVector<PHINode*, 4> PhiVector;
  142. /// When we unroll loops we have multiple vector values for each scalar.
  143. /// This data structure holds the unrolled and vectorized values that
  144. /// originated from one scalar instruction.
  145. typedef SmallVector<Value*, 2> VectorParts;
  146. /// Add code that checks at runtime if the accessed arrays overlap.
  147. /// Returns the comparator value or NULL if no check is needed.
  148. Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
  149. Instruction *Loc);
  150. /// Create an empty loop, based on the loop ranges of the old loop.
  151. void createEmptyLoop(LoopVectorizationLegality *Legal);
  152. /// Copy and widen the instructions from the old loop.
  153. void vectorizeLoop(LoopVectorizationLegality *Legal);
  154. /// A helper function that computes the predicate of the block BB, assuming
  155. /// that the header block of the loop is set to True. It returns the *entry*
  156. /// mask for the block BB.
  157. VectorParts createBlockInMask(BasicBlock *BB);
  158. /// A helper function that computes the predicate of the edge between SRC
  159. /// and DST.
  160. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
  161. /// A helper function to vectorize a single BB within the innermost loop.
  162. void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
  163. PhiVector *PV);
  164. /// Insert the new loop to the loop hierarchy and pass manager
  165. /// and update the analysis passes.
  166. void updateAnalysis();
  167. /// This instruction is un-vectorizable. Implement it as a sequence
  168. /// of scalars.
  169. void scalarizeInstruction(Instruction *Instr);
  170. /// Create a broadcast instruction. This method generates a broadcast
  171. /// instruction (shuffle) for loop invariant values and for the induction
  172. /// value. If this is the induction variable then we extend it to N, N+1, ...
  173. /// this is needed because each iteration in the loop corresponds to a SIMD
  174. /// element.
  175. Value *getBroadcastInstrs(Value *V);
  176. /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
  177. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
  178. /// The sequence starts at StartIndex.
  179. Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
  180. /// When we go over instructions in the basic block we rely on previous
  181. /// values within the current basic block or on loop invariant values.
  182. /// When we widen (vectorize) values we place them in the map. If the values
  183. /// are not within the map, they have to be loop invariant, so we simply
  184. /// broadcast them into a vector.
  185. VectorParts &getVectorValue(Value *V);
  186. /// Generate a shuffle sequence that will reverse the vector Vec.
  187. Value *reverseVector(Value *Vec);
  188. /// This is a helper class that holds the vectorizer state. It maps scalar
  189. /// instructions to vector instructions. When the code is 'unrolled' then
  190. /// then a single scalar value is mapped to multiple vector parts. The parts
  191. /// are stored in the VectorPart type.
  192. struct ValueMap {
  193. /// C'tor. UnrollFactor controls the number of vectors ('parts') that
  194. /// are mapped.
  195. ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
  196. /// \return True if 'Key' is saved in the Value Map.
  197. bool has(Value *Key) { return MapStoreage.count(Key); }
  198. /// Initializes a new entry in the map. Sets all of the vector parts to the
  199. /// save value in 'Val'.
  200. /// \return A reference to a vector with splat values.
  201. VectorParts &splat(Value *Key, Value *Val) {
  202. MapStoreage[Key].clear();
  203. MapStoreage[Key].append(UF, Val);
  204. return MapStoreage[Key];
  205. }
  206. ///\return A reference to the value that is stored at 'Key'.
  207. VectorParts &get(Value *Key) {
  208. if (!has(Key))
  209. MapStoreage[Key].resize(UF);
  210. return MapStoreage[Key];
  211. }
  212. /// The unroll factor. Each entry in the map stores this number of vector
  213. /// elements.
  214. unsigned UF;
  215. /// Map storage. We use std::map and not DenseMap because insertions to a
  216. /// dense map invalidates its iterators.
  217. std::map<Value*, VectorParts> MapStoreage;
  218. };
  219. /// The original loop.
  220. Loop *OrigLoop;
  221. /// Scev analysis to use.
  222. ScalarEvolution *SE;
  223. /// Loop Info.
  224. LoopInfo *LI;
  225. /// Dominator Tree.
  226. DominatorTree *DT;
  227. /// Data Layout.
  228. DataLayout *DL;
  229. /// The vectorization SIMD factor to use. Each vector will have this many
  230. /// vector elements.
  231. unsigned VF;
  232. /// The vectorization unroll factor to use. Each scalar is vectorized to this
  233. /// many different vector instructions.
  234. unsigned UF;
  235. /// The builder that we use
  236. IRBuilder<> Builder;
  237. // --- Vectorization state ---
  238. /// The vector-loop preheader.
  239. BasicBlock *LoopVectorPreHeader;
  240. /// The scalar-loop preheader.
  241. BasicBlock *LoopScalarPreHeader;
  242. /// Middle Block between the vector and the scalar.
  243. BasicBlock *LoopMiddleBlock;
  244. ///The ExitBlock of the scalar loop.
  245. BasicBlock *LoopExitBlock;
  246. ///The vector loop body.
  247. BasicBlock *LoopVectorBody;
  248. ///The scalar loop body.
  249. BasicBlock *LoopScalarBody;
  250. /// A list of all bypass blocks. The first block is the entry of the loop.
  251. SmallVector<BasicBlock *, 4> LoopBypassBlocks;
  252. /// The new Induction variable which was added to the new block.
  253. PHINode *Induction;
  254. /// The induction variable of the old basic block.
  255. PHINode *OldInduction;
  256. /// Maps scalars to widened vectors.
  257. ValueMap WidenMap;
  258. };
/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
/// to what vectorization factor.
/// This class does not look at the profitability of vectorization, only the
/// legality. This class has two main kinds of checks:
/// * Memory checks - The code in canVectorizeMemory checks if vectorization
///   will change the order of memory accesses in a way that will change the
///   correctness of the program.
/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
///   checks for a number of different conditions, such as the availability of a
///   single induction variable, that all types are supported and vectorize-able,
///   etc. This code reflects the capabilities of InnerLoopVectorizer.
/// This class is also used by InnerLoopVectorizer for identifying the
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
  /// Ctor. Stores the analyses; no checking happens until canVectorize().
  LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
                            DominatorTree *DT)
      : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}

  /// This enum represents the kinds of reductions that we support.
  enum ReductionKind {
    RK_NoReduction, ///< Not a reduction.
    RK_IntegerAdd,  ///< Sum of integers.
    RK_IntegerMult, ///< Product of integers.
    RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
    RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
    RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
    RK_FloatAdd,    ///< Sum of floats.
    RK_FloatMult    ///< Product of floats.
  };

  /// This enum represents the kinds of inductions that we support.
  enum InductionKind {
    IK_NoInduction,         ///< Not an induction variable.
    IK_IntInduction,        ///< Integer induction variable. Step = 1.
    IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
    IK_PtrInduction         ///< Pointer induction variable. Step = sizeof(elem).
  };

  /// This POD struct holds information about reduction variables.
  struct ReductionDescriptor {
    // Default ctor: marks the descriptor as "not a reduction".
    ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
      Kind(RK_NoReduction) {}

    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}

    // The starting value of the reduction.
    // It does not have to be zero!
    Value *StartValue;
    // The instruction whose value is used outside the loop.
    Instruction *LoopExitInstr;
    // The kind of the reduction.
    ReductionKind Kind;
  };

  // This POD struct holds information about the memory runtime legality
  // check that a group of pointers do not overlap.
  struct RuntimePointerCheck {
    RuntimePointerCheck() : Need(false) {}

    /// Reset the state of the pointer runtime information.
    void reset() {
      Need = false;
      Pointers.clear();
      Starts.clear();
      Ends.clear();
    }

    /// Insert a pointer and calculate the start and end SCEVs.
    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);

    /// This flag indicates if we need to add the runtime check.
    bool Need;
    /// Holds the pointers that we need to check.
    SmallVector<Value*, 2> Pointers;
    /// Holds the pointer value at the beginning of the loop.
    SmallVector<const SCEV*, 2> Starts;
    /// Holds the pointer value at the end of the loop.
    SmallVector<const SCEV*, 2> Ends;
  };

  /// A POD for saving information about induction variables.
  struct InductionInfo {
    InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
    // Default ctor: marks the entry as "not an induction".
    InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
    /// Start value.
    Value *StartValue;
    /// Induction kind.
    InductionKind IK;
  };

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;

  /// InductionList saves induction variables and maps them to the
  /// induction descriptor.
  typedef MapVector<PHINode*, InductionInfo> InductionList;

  /// Returns true if it is legal to vectorize this loop.
  /// This does not mean that it is profitable to vectorize this
  /// loop, only that it is legal to do so.
  bool canVectorize();

  /// Returns the Induction variable.
  PHINode *getInduction() { return Induction; }

  /// Returns the reduction variables found in the loop.
  ReductionList *getReductionVars() { return &Reductions; }

  /// Returns the induction variables found in the loop.
  InductionList *getInductionVars() { return &Inductions; }

  /// Returns True if V is an induction variable in this loop.
  bool isInductionVariable(const Value *V);

  /// Return true if the block BB needs to be predicated in order for the loop
  /// to be vectorized.
  bool blockNeedsPredication(BasicBlock *BB);

  /// Check if this pointer is consecutive when vectorizing. This happens
  /// when the last index of the GEP is the induction variable, or that the
  /// pointer itself is an induction variable.
  /// This check allows us to vectorize A[idx] into a wide load/store.
  /// Returns:
  /// 0 - Stride is unknown or non consecutive.
  /// 1 - Address is consecutive.
  /// -1 - Address is consecutive, and decreasing.
  int isConsecutivePtr(Value *Ptr);

  /// Returns true if the value V is uniform within the loop.
  bool isUniform(Value *V);

  /// Returns true if this instruction will remain scalar after vectorization.
  bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }

  /// Returns the information that we collected about runtime memory check.
  RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }

private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
  /// and we only need to check individual instructions.
  bool canVectorizeInstrs();

  /// When we vectorize loops we may change the order in which
  /// we read and write from memory. This method checks if it is
  /// legal to vectorize the code, considering only memory constraints.
  /// Returns true if the loop is vectorizable
  bool canVectorizeMemory();

  /// Return true if we can vectorize this loop using the IF-conversion
  /// transformation.
  bool canVectorizeWithIfConvert();

  /// Collect the variables that need to stay uniform after vectorization.
  void collectLoopUniforms();

  /// Return true if all of the instructions in the block can be speculatively
  /// executed.
  bool blockCanBePredicated(BasicBlock *BB);

  /// Returns True, if 'Phi' is the kind of reduction variable for type
  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
  /// Returns true if the instruction I can be a reduction variable of type
  /// 'Kind'.
  bool isReductionInstr(Instruction *I, ReductionKind Kind);
  /// Returns the induction kind of Phi. This function may return IK_NoInduction
  /// if the PHI is not an induction variable.
  InductionKind isInductionVariable(PHINode *Phi);
  /// Return true if we can compute the address bounds of Ptr within the loop.
  bool hasComputableBounds(Value *Ptr);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
  // Dominators.
  DominatorTree *DT;

  // --- vectorization state --- //

  /// Holds the integer induction variable. This is the counter of the
  /// loop.
  PHINode *Induction;
  /// Holds the reduction variables.
  ReductionList Reductions;
  /// Holds all of the induction variables that we found in the loop.
  /// Notice that inductions don't need to start at zero and that induction
  /// variables can be pointers.
  InductionList Inductions;

  /// Allowed outside users. This holds the reduction
  /// vars which can be accessed from outside the loop.
  SmallPtrSet<Value*, 4> AllowedExit;
  /// This set holds the variables which are known to be uniform after
  /// vectorization.
  SmallPtrSet<Instruction*, 4> Uniforms;
  /// We need to check that all of the pointers in this list are disjoint
  /// at runtime.
  RuntimePointerCheck PtrRtCheck;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI)
      : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// The returned pair is (vectorization factor, expected cost of that VF).
  /// This method checks every power of two up to VF. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  std::pair<unsigned, unsigned>
  selectVectorizationFactor(bool OptForSize, unsigned UserVF);

  /// \returns The size (in bits) of the widest type in the code that
  /// needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  unsigned getWidestType();

  /// \return The most profitable unroll factor.
  /// If UserUF is non-zero then this method finds the best unroll-factor
  /// based on register pressure and other parameters.
  /// VF and LoopCost are the selected vectorization factor and the cost of the
  /// selected VF.
  unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
                              unsigned LoopCost);

  /// \brief A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    unsigned LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    unsigned MaxLocalUsers;
    /// Holds the number of instructions in the loop.
    unsigned NumInstructions;
  };

  /// \return information about the register usage of the loop.
  RegisterUsage calculateRegisterUsage();

private:
  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width.
  unsigned expectedCost(unsigned VF);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  unsigned getInstructionCost(Instruction *I, unsigned VF);

  /// A helper function for converting Scalar types to vector types.
  /// If the incoming type is void, we return void. If the VF is 1, we return
  /// the scalar type.
  static Type* ToVectorTy(Type *Scalar, unsigned VF);

  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// Loop Info analysis.
  LoopInfo *LI;
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
  const TargetTransformInfo &TTI;
};
  499. /// The LoopVectorize Pass.
  500. struct LoopVectorize : public LoopPass {
  501. /// Pass identification, replacement for typeid
  502. static char ID;
  503. explicit LoopVectorize() : LoopPass(ID) {
  504. initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  505. }
  506. ScalarEvolution *SE;
  507. DataLayout *DL;
  508. LoopInfo *LI;
  509. TargetTransformInfo *TTI;
  510. DominatorTree *DT;
  511. virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
  512. // We only vectorize innermost loops.
  513. if (!L->empty())
  514. return false;
  515. SE = &getAnalysis<ScalarEvolution>();
  516. DL = getAnalysisIfAvailable<DataLayout>();
  517. LI = &getAnalysis<LoopInfo>();
  518. TTI = &getAnalysis<TargetTransformInfo>();
  519. DT = &getAnalysis<DominatorTree>();
  520. DEBUG(dbgs() << "LV: Checking a loop in \"" <<
  521. L->getHeader()->getParent()->getName() << "\"\n");
  522. // Check if it is legal to vectorize the loop.
  523. LoopVectorizationLegality LVL(L, SE, DL, DT);
  524. if (!LVL.canVectorize()) {
  525. DEBUG(dbgs() << "LV: Not vectorizing.\n");
  526. return false;
  527. }
  528. // Use the cost model.
  529. LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
  530. // Check the function attribues to find out if this function should be
  531. // optimized for size.
  532. Function *F = L->getHeader()->getParent();
  533. Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
  534. Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
  535. unsigned FnIndex = AttributeSet::FunctionIndex;
  536. bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
  537. bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
  538. if (NoFloat) {
  539. DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
  540. "attribute is used.\n");
  541. return false;
  542. }
  543. // Select the optimal vectorization factor.
  544. std::pair<unsigned, unsigned> VFPair;
  545. VFPair = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
  546. // Select the unroll factor.
  547. unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
  548. VFPair.first, VFPair.second);
  549. unsigned VF = VFPair.first;
  550. if (VF == 1) {
  551. DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
  552. return false;
  553. }
  554. DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
  555. F->getParent()->getModuleIdentifier()<<"\n");
  556. DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
  557. // If we decided that it is *legal* to vectorizer the loop then do it.
  558. InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
  559. LB.vectorize(&LVL);
  560. DEBUG(verifyFunction(*L->getHeader()->getParent()));
  561. return true;
  562. }
  563. virtual void getAnalysisUsage(AnalysisUsage &AU) const {
  564. LoopPass::getAnalysisUsage(AU);
  565. AU.addRequiredID(LoopSimplifyID);
  566. AU.addRequiredID(LCSSAID);
  567. AU.addRequired<DominatorTree>();
  568. AU.addRequired<LoopInfo>();
  569. AU.addRequired<ScalarEvolution>();
  570. AU.addRequired<TargetTransformInfo>();
  571. AU.addPreserved<LoopInfo>();
  572. AU.addPreserved<DominatorTree>();
  573. }
  574. };
  575. } // end anonymous namespace
  576. //===----------------------------------------------------------------------===//
  577. // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
  578. // LoopVectorizationCostModel.
  579. //===----------------------------------------------------------------------===//
  580. void
  581. LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
  582. Loop *Lp, Value *Ptr) {
  583. const SCEV *Sc = SE->getSCEV(Ptr);
  584. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
  585. assert(AR && "Invalid addrec expression");
  586. const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
  587. const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
  588. Pointers.push_back(Ptr);
  589. Starts.push_back(AR->getStart());
  590. Ends.push_back(ScEnd);
  591. }
  592. Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  593. // Save the current insertion location.
  594. Instruction *Loc = Builder.GetInsertPoint();
  595. // We need to place the broadcast of invariant variables outside the loop.
  596. Instruction *Instr = dyn_cast<Instruction>(V);
  597. bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
  598. bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
  599. // Place the code for broadcasting invariant variables in the new preheader.
  600. if (Invariant)
  601. Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  602. // Broadcast the scalar into all locations in the vector.
  603. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
  604. // Restore the builder insertion point.
  605. if (Invariant)
  606. Builder.SetInsertPoint(Loc);
  607. return Shuf;
  608. }
  609. Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
  610. bool Negate) {
  611. assert(Val->getType()->isVectorTy() && "Must be a vector");
  612. assert(Val->getType()->getScalarType()->isIntegerTy() &&
  613. "Elem must be an integer");
  614. // Create the types.
  615. Type *ITy = Val->getType()->getScalarType();
  616. VectorType *Ty = cast<VectorType>(Val->getType());
  617. int VLen = Ty->getNumElements();
  618. SmallVector<Constant*, 8> Indices;
  619. // Create a vector of consecutive numbers from zero to VF.
  620. for (int i = 0; i < VLen; ++i) {
  621. int Idx = Negate ? (-i): i;
  622. Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
  623. }
  624. // Add the consecutive indices to the vector value.
  625. Constant *Cv = ConstantVector::get(Indices);
  626. assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
  627. return Builder.CreateAdd(Val, Cv, "induction");
  628. }
  629. int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  630. assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
  631. // If this value is a pointer induction variable we know it is consecutive.
  632. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
  633. if (Phi && Inductions.count(Phi)) {
  634. InductionInfo II = Inductions[Phi];
  635. if (IK_PtrInduction == II.IK)
  636. return 1;
  637. }
  638. GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
  639. if (!Gep)
  640. return 0;
  641. unsigned NumOperands = Gep->getNumOperands();
  642. Value *LastIndex = Gep->getOperand(NumOperands - 1);
  643. // Check that all of the gep indices are uniform except for the last.
  644. for (unsigned i = 0; i < NumOperands - 1; ++i)
  645. if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
  646. return 0;
  647. // We can emit wide load/stores only if the last index is the induction
  648. // variable.
  649. const SCEV *Last = SE->getSCEV(LastIndex);
  650. if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
  651. const SCEV *Step = AR->getStepRecurrence(*SE);
  652. // The memory is consecutive because the last index is consecutive
  653. // and all other indices are loop invariant.
  654. if (Step->isOne())
  655. return 1;
  656. if (Step->isAllOnesValue())
  657. return -1;
  658. }
  659. return 0;
  660. }
  661. bool LoopVectorizationLegality::isUniform(Value *V) {
  662. return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
  663. }
  664. InnerLoopVectorizer::VectorParts&
  665. InnerLoopVectorizer::getVectorValue(Value *V) {
  666. assert(V != Induction && "The new induction variable should not be used.");
  667. assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  668. // If we have this scalar in the map, return it.
  669. if (WidenMap.has(V))
  670. return WidenMap.get(V);
  671. // If this scalar is unknown, assume that it is a constant or that it is
  672. // loop invariant. Broadcast V and save the value for future uses.
  673. Value *B = getBroadcastInstrs(V);
  674. WidenMap.splat(V, B);
  675. return WidenMap.get(V);
  676. }
  677. Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  678. assert(Vec->getType()->isVectorTy() && "Invalid type");
  679. SmallVector<Constant*, 8> ShuffleMask;
  680. for (unsigned i = 0; i < VF; ++i)
  681. ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
  682. return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
  683. ConstantVector::get(ShuffleMask),
  684. "reverse");
  685. }
// Scalarize Instr: emit VF * UF scalar clones of the instruction (one per
// vector lane per unroll part) and insert each result into a vector in the
// WidenMap so that later widened users can consume them.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
  SmallVector<VectorParts, 4> Params;

  // Find all of the vectorized parameters.
  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
    Value *SrcOp = Instr->getOperand(op);

    // If we are accessing the old induction variable, use the new one.
    if (SrcOp == OldInduction) {
      Params.push_back(getVectorValue(SrcOp));
      continue;
    }

    // Try using previously calculated values.
    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);

    // If the src is an instruction that appeared earlier in the basic block
    // then it should already be vectorized.
    if (SrcInst && OrigLoop->contains(SrcInst)) {
      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
      // The parameter is a vector value from earlier.
      Params.push_back(WidenMap.get(SrcInst));
    } else {
      // The parameter is a scalar from outside the loop. Maybe even a
      // constant. Replicate the same scalar once per unroll part.
      VectorParts Scalars;
      Scalars.append(UF, SrcOp);
      Params.push_back(Scalars);
    }
  }

  assert(Params.size() == Instr->getNumOperands() &&
         "Invalid number of operands");

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
  Value *UndefVec = IsVoidRetTy ? 0 :
    UndefValue::get(VectorType::get(Instr->getType(), VF));
  // Create a new entry in the WidenMap and initialize it to Undef or Null.
  VecResults holds one vector per unroll part.
  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);

  // For each scalar that we create:
  for (unsigned Width = 0; Width < VF; ++Width) {
    // For each vector unroll 'part':
    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *Cloned = Instr->clone();
      if (!IsVoidRetTy)
        Cloned->setName(Instr->getName() + ".cloned");
      // Replace the operands of the cloned instructions with extracted
      // scalars.
      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
        Value *Op = Params[op][Part];
        // Param is a vector. Need to extract the right lane.
        if (Op->getType()->isVectorTy())
          Op = Builder.CreateExtractElement(Op, Builder.getInt32(Width));
        Cloned->setOperand(op, Op);
      }
      // Place the cloned scalar in the new loop.
      Builder.Insert(Cloned);

      // If the original scalar returns a value we need to place it in a vector
      // so that future users will be able to use it.
      if (!IsVoidRetTy)
        VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
                                                       Builder.getInt32(Width));
    }
  }
}
/// Emit a run-time overlap check before \p Loc for every pair of pointers that
/// the legality analysis flagged. Returns the final i1 conflict value (true
/// when any two ranges may overlap), or NULL when no check is needed.
Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
                                     Instruction *Loc) {
  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
    Legal->getRuntimePointerCheck();

  if (!PtrRtCheck->Need)
    return NULL;

  Instruction *MemoryRuntimeCheck = 0;
  unsigned NumPointers = PtrRtCheck->Pointers.size();
  SmallVector<Value* , 2> Starts;
  SmallVector<Value* , 2> Ends;

  SCEVExpander Exp(*SE, "induction");

  // Use this type for pointer arithmetic.
  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);

  // Materialize the [start, end] range of every checked pointer as IR.
  for (unsigned i = 0; i < NumPointers; ++i) {
    Value *Ptr = PtrRtCheck->Pointers[i];
    const SCEV *Sc = SE->getSCEV(Ptr);

    if (SE->isLoopInvariant(Sc, OrigLoop)) {
      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
            *Ptr <<"\n");
      // An invariant pointer covers a single address: start == end.
      Starts.push_back(Ptr);
      Ends.push_back(Ptr);
    } else {
      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
      // Expand the precomputed SCEV bounds into instructions before Loc.
      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
      Starts.push_back(Start);
      Ends.push_back(End);
    }
  }

  // Two ranges conflict when each start is <= the other range's end;
  // OR all of the pairwise results into a single flag.
  for (unsigned i = 0; i < NumPointers; ++i) {
    for (unsigned j = i+1; j < NumPointers; ++j) {
      Instruction::CastOps Op = Instruction::BitCast;
      Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
      Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
      Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc);
      Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc);

      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Start0, End1, "bound0", Loc);
      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
                                    Start1, End0, "bound1", Loc);
      Instruction *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0,
                                                       Cmp1, "found.conflict",
                                                       Loc);
      if (MemoryRuntimeCheck)
        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
                                                    MemoryRuntimeCheck,
                                                    IsConflict,
                                                    "conflict.rdx", Loc);
      else
        MemoryRuntimeCheck = IsConflict;
    }
  }

  return MemoryRuntimeCheck;
}
/// Build the skeleton CFG for the vectorized loop (blocks, induction PHI,
/// trip-count logic, bypass checks) without yet widening any instructions.
void
InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ]     <-- vector loop bypass (may consist of multiple blocks).
     /  |
    /   v
   |   [ ]     <-- vector pre header.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- vector loop.
   |    |
    \   v
      >[ ]     <--- middle-block.
     /  |
    /   v
   |   [ ]     <--- new preheader.
   |    |
   |    v
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder.
    \   |
     \  v
      >[ ]     <-- exit block.
   ...
   */

  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
  assert(ExitBlock && "Must have an exit block");

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  OldInduction = Legal->getInduction();
  Type *IdxTy = OldInduction ? OldInduction->getType() :
    DL->getIntPtrType(SE->getContext());

  // Find the loop boundaries.
  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");

  // Get the total trip count from the count by adding 1.
  ExitCount = SE->getAddExpr(ExitCount,
                             SE->getConstant(ExitCount->getType(), 1));

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, "induction");

  // Count holds the overall loop count (N).
  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                   BypassBlock->getTerminator());

  // The loop index does not have to start at Zero. Find the original start
  // value from the induction PHI node. If we don't have an induction variable
  // then we know that it starts at zero.
  Value *StartIdx = OldInduction ?
    OldInduction->getIncomingValueForBlock(BypassBlock):
    ConstantInt::get(IdxTy, 0);

  assert(BypassBlock && "Invalid loop structure");
  LoopBypassBlocks.push_back(BypassBlock);

  // Split the single block loop into the two loop structure described above.
  BasicBlock *VectorPH =
    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody =
    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
  BasicBlock *MiddleBlock =
    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
  BasicBlock *ScalarPH =
    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");

  // This is the location in which we add all of the logic for bypassing
  // the new vector loop.
  Instruction *Loc = BypassBlock->getTerminator();

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
  Induction = Builder.CreatePHI(IdxTy, 2, "index");
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Constant *Step = ConstantInt::get(IdxTy, VF * UF);

  // We may need to extend the index in case there is a type mismatch.
  // We know that the count starts at zero and does not overflow.
  unsigned IdxTyBW = IdxTy->getScalarSizeInBits();
  if (Count->getType() != IdxTy) {
    // The exit count can be of pointer type. Convert it to the correct
    // integer type.
    if (ExitCount->getType()->isPointerTy())
      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
    else if (IdxTyBW < Count->getType()->getScalarSizeInBits())
      Count = CastInst::CreateTruncOrBitCast(Count, IdxTy, "tr.cnt", Loc);
    else
      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
  }

  // Add the start index to the loop count to get the new end index.
  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);

  // Now we need to generate the expression for N - (N % VF), which is
  // the part that the vectorized body will execute.
  Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc);
  Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                     "end.idx.rnd.down", Loc);

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop.
  Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                               IdxEndRoundDown,
                               StartIdx,
                               "cmp.zero", Loc);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  if (Instruction *MemoryRuntimeCheck = addRuntimeCheck(Legal, Loc)) {
    // Create a new block containing the memory check.
    BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemoryRuntimeCheck,
                                                          "vector.memcheck");
    LoopBypassBlocks.push_back(CheckBlock);

    // Replace the branch into the memory check block with a conditional branch
    // for the "few elements case".
    Instruction *OldTerm = BypassBlock->getTerminator();
    BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
    OldTerm->eraseFromParent();

    // From here on the bypass condition is the memory-overlap result.
    Cmp = MemoryRuntimeCheck;
    assert(Loc == CheckBlock->getTerminator());
  }

  BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
  // Remove the old terminator.
  Loc->eraseFromParent();

  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.

  // This variable saves the new starting index for the scalar loop.
  PHINode *ResumeIndex = 0;
  LoopVectorizationLegality::InductionList::iterator I, E;
  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
  for (I = List->begin(), E = List->end(); I != E; ++I) {
    PHINode *OrigPhi = I->first;
    LoopVectorizationLegality::InductionInfo II = I->second;
    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
                                         MiddleBlock->getTerminator());
    Value *EndValue = 0;
    switch (II.IK) {
    case LoopVectorizationLegality::IK_NoInduction:
      llvm_unreachable("Unknown induction");
    case LoopVectorizationLegality::IK_IntInduction: {
      // Handle the integer induction counter:
      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
      assert(OrigPhi == OldInduction && "Unknown integer PHI");
      // We know what the end value is.
      EndValue = IdxEndRoundDown;
      // We also know which PHI node holds it.
      ResumeIndex = ResumeVal;
      break;
    }
    case LoopVectorizationLegality::IK_ReverseIntInduction: {
      // Convert the CountRoundDown variable to the PHI size.
      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
      Value *CRD = CountRoundDown;
      if (CRDSize > IISize)
        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
                               II.StartValue->getType(), "tr.crd",
                               LoopBypassBlocks.back()->getTerminator());
      else if (CRDSize < IISize)
        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
                               II.StartValue->getType(),
                               "sext.crd",
                               LoopBypassBlocks.back()->getTerminator());
      // Handle reverse integer induction counter:
      EndValue =
        BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
                                  LoopBypassBlocks.back()->getTerminator());
      break;
    }
    case LoopVectorizationLegality::IK_PtrInduction: {
      // For pointer induction variables, calculate the offset using
      // the end index.
      EndValue =
        GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
                                  LoopBypassBlocks.back()->getTerminator());
      break;
    }
    }// end of case

    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
    ResumeVal->addIncoming(EndValue, VecBody);

    // Fix the scalar body counter (PHI node).
    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
  }

  // If we are generating a new induction variable then we also need to
  // generate the code that calculates the exit value. This value is not
  // simply the end of the counter because we may skip the vectorized body
  // in case of a runtime check.
  if (!OldInduction){
    assert(!ResumeIndex && "Unexpected resume value found");
    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
                                  MiddleBlock->getTerminator());
    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
  }

  // Make sure that we found the index where scalar loop needs to continue.
  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
         "Invalid resume Index");

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.
  // If (N - N%VF) == N, then we *don't* need to run the remainder.
  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
                                ResumeIndex, "cmp.n",
                                MiddleBlock->getTerminator());

  BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
  // Remove the old terminator.
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(StartIdx, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();

  // Get ready to start creating new instructions into the vectorized body.
  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());

  // Create and register the new vector loop.
  Loop* Lp = new Loop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
    // Skip index 0: the original preheader already belongs to the parent.
    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
      ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  } else {
    LI->addTopLevelLoop(Lp);
  }

  Lp->addBasicBlockToLoop(VecBody, LI->getBase());

  // Save the state.
  LoopVectorPreHeader = VectorPH;
  LoopScalarPreHeader = ScalarPH;
  LoopMiddleBlock = MiddleBlock;
  LoopExitBlock = ExitBlock;
  LoopVectorBody = VecBody;
  LoopScalarBody = OldBasicBlock;
}
  1053. /// This function returns the identity element (or neutral element) for
  1054. /// the operation K.
  1055. static Constant*
  1056. getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
  1057. switch (K) {
  1058. case LoopVectorizationLegality:: RK_IntegerXor:
  1059. case LoopVectorizationLegality:: RK_IntegerAdd:
  1060. case LoopVectorizationLegality:: RK_IntegerOr:
  1061. // Adding, Xoring, Oring zero to a number does not change it.
  1062. return ConstantInt::get(Tp, 0);
  1063. case LoopVectorizationLegality:: RK_IntegerMult:
  1064. // Multiplying a number by 1 does not change it.
  1065. return ConstantInt::get(Tp, 1);
  1066. case LoopVectorizationLegality:: RK_IntegerAnd:
  1067. // AND-ing a number with an all-1 value does not change it.
  1068. return ConstantInt::get(Tp, -1, true);
  1069. case LoopVectorizationLegality:: RK_FloatMult:
  1070. // Multiplying a number by 1 does not change it.
  1071. return ConstantFP::get(Tp, 1.0L);
  1072. case LoopVectorizationLegality:: RK_FloatAdd:
  1073. // Adding zero to a number does not change it.
  1074. return ConstantFP::get(Tp, 0.0L);
  1075. default:
  1076. llvm_unreachable("Unknown reduction kind");
  1077. }
  1078. }
  1079. static bool
  1080. isTriviallyVectorizableIntrinsic(Instruction *Inst) {
  1081. IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
  1082. if (!II)
  1083. return false;
  1084. switch (II->getIntrinsicID()) {
  1085. case Intrinsic::sqrt:
  1086. case Intrinsic::sin:
  1087. case Intrinsic::cos:
  1088. case Intrinsic::exp:
  1089. case Intrinsic::exp2:
  1090. case Intrinsic::log:
  1091. case Intrinsic::log10:
  1092. case Intrinsic::log2:
  1093. case Intrinsic::fabs:
  1094. case Intrinsic::floor:
  1095. case Intrinsic::ceil:
  1096. case Intrinsic::trunc:
  1097. case Intrinsic::rint:
  1098. case Intrinsic::nearbyint:
  1099. case Intrinsic::pow:
  1100. case Intrinsic::fma:
  1101. case Intrinsic::fmuladd:
  1102. return true;
  1103. default:
  1104. return false;
  1105. }
  1106. return false;
  1107. }
  1108. /// This function translates the reduction kind to an LLVM binary operator.
  1109. static Instruction::BinaryOps
  1110. getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
  1111. switch (Kind) {
  1112. case LoopVectorizationLegality::RK_IntegerAdd:
  1113. return Instruction::Add;
  1114. case LoopVectorizationLegality::RK_IntegerMult:
  1115. return Instruction::Mul;
  1116. case LoopVectorizationLegality::RK_IntegerOr:
  1117. return Instruction::Or;
  1118. case LoopVectorizationLegality::RK_IntegerAnd:
  1119. return Instruction::And;
  1120. case LoopVectorizationLegality::RK_IntegerXor:
  1121. return Instruction::Xor;
  1122. case LoopVectorizationLegality::RK_FloatMult:
  1123. return Instruction::FMul;
  1124. case LoopVectorizationLegality::RK_FloatAdd:
  1125. return Instruction::FAdd;
  1126. default:
  1127. llvm_unreachable("Unknown reduction operation");
  1128. }
  1129. }
/// Widen every block of the original loop and then fix up the reduction
/// PHI nodes that were intentionally created without incoming edges.
void
InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should be also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//
  BasicBlock &BB = *OrigLoop->getHeader();
  Constant *Zero =
    ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);

  // In order to support reduction variables we need to be able to vectorize
  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
  // stages. First, we create a new vector PHI node with no incoming edges.
  // We use this value when we vectorize all of the instructions that use the
  // PHI. Next, after all of the instructions in the block are complete we
  // add the new incoming edges to the PHI. At this point all of the
  // instructions in the basic block are vectorized, so we can use them to
  // construct the PHI.
  PhiVector RdxPHIsToFix;

  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  // Vectorize all of the blocks in the original loop.
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb)
    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);

  // At this point every instruction in the original loop is widened to
  // a vector form. We are almost done. Now, we need to fix the PHI nodes
  // that we vectorized. The PHI nodes are currently empty because we did
  // not want to introduce cycles. Notice that the remaining PHI nodes
  // that we need to fix are reduction variables.

  // Create the 'reduced' values for each of the induction vars.
  // The reduced values are the vector values that we scalarize and combine
  // after the loop is finished.
  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
       it != e; ++it) {
    PHINode *RdxPhi = *it;
    assert(RdxPhi && "Unable to recover vectorized PHI");

    // Find the reduction variable descriptor.
    assert(Legal->getReductionVars()->count(RdxPhi) &&
           "Unable to find the reduction variable");
    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
      (*Legal->getReductionVars())[RdxPhi];

    // We need to generate a reduction vector from the incoming scalar.
    // To do so, we need to generate the 'identity' vector and override
    // one of the elements with the incoming scalar reduction. We need
    // to do it in the vector-loop preheader.
    Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());

    // This is the vector-clone of the value that leaves the loop.
    VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
    Type *VecTy = VectorExit[0]->getType();

    // Find the reduction identity variable. Zero for addition, or, xor,
    // one for multiplication, -1 for And.
    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
    Constant *Identity = ConstantVector::getSplat(VF, Iden);

    // This vector is the Identity vector where the first element is the
    // incoming scalar reduction.
    Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);

    // Fix the vector-loop phi.
    // We created the induction variable so we know that the
    // preheader is the first entry.
    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);

    // Reductions do not have to start at zero. They can start with
    // any loop invariant values.
    VectorParts &VecRdxPhi = WidenMap.get(RdxPhi);
    BasicBlock *Latch = OrigLoop->getLoopLatch();
    Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
    VectorParts &Val = getVectorValue(LoopVal);
    for (unsigned part = 0; part < UF; ++part) {
      // Make sure to add the reduction start value only to the
      // first unroll part; the other parts start at the identity.
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
      cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
    }

    // Before each round, move the insertion point right between
    // the PHIs and the values we are going to write.
    // This allows us to write both PHINodes and the extractelement
    // instructions.
    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

    VectorParts RdxParts;
    for (unsigned part = 0; part < UF; ++part) {
      // This PHINode contains the vectorized reduction variable, or
      // the initial value vector, if we bypass the vector loop.
      VectorParts &RdxExitVal = getVectorValue(RdxDesc.LoopExitInstr);
      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
      Value *StartVal = (part == 0) ? VectorStart : Identity;
      // The start value arrives from every bypass edge into the middle block.
      for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
      NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
      RdxParts.push_back(NewPhi);
    }

    // Reduce all of the unrolled parts into a single vector.
    Value *ReducedPartRdx = RdxParts[0];
    for (unsigned part = 1; part < UF; ++part) {
      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
                                           "bin.rdx");
    }

    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
    // and vector ops, reducing the set of values being computed by half each
    // round.
    assert(isPowerOf2_32(VF) &&
           "Reduction emission only supported for pow2 vectors!");
    Value *TmpVec = ReducedPartRdx;
    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
    for (unsigned i = VF; i != 1; i >>= 1) {
      // Move the upper half of the vector to the lower half.
      for (unsigned j = 0; j != i/2; ++j)
        ShuffleMask[j] = Builder.getInt32(i/2 + j);

      // Fill the rest of the mask with undef.
      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
                UndefValue::get(Builder.getInt32Ty()));

      Value *Shuf =
        Builder.CreateShuffleVector(TmpVec,
                                    UndefValue::get(TmpVec->getType()),
                                    ConstantVector::get(ShuffleMask),
                                    "rdx.shuf");

      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
    }

    // The result is in the first element of the vector.
    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));

    // Now, we need to fix the users of the reduction variable
    // inside and outside of the scalar remainder loop.
    // We know that the loop is in LCSSA form. We need to update the
    // PHI nodes in the exit blocks.
    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
      if (!LCSSAPhi) continue;

      // All PHINodes need to have a single entry edge, or two if
      // we already fixed them.
      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");

      // We found our reduction value exit-PHI. Update it with the
      // incoming bypass edge.
      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
        // Add an edge coming from the bypass.
        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
        break;
      }
    }// end of the LCSSA phi scan.

    // Fix the scalar loop reduction variable with the incoming reduction sum
    // from the vector body and from the backedge value.
    int IncomingEdgeBlockIdx =
      (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
    // Pick the other block.
    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
  }// end of for each redux variable.

  // The Loop exit block may have single value PHI nodes where the incoming
  // value is 'undef'. While vectorizing we only handled real values that
  // were defined inside the loop. Here we handle the 'undef case'.
  // See PR14725.
  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
       LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
    if (!LCSSAPhi) continue;
    if (LCSSAPhi->getNumIncomingValues() == 1)
      LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                            LoopMiddleBlock);
  }
}
  1299. InnerLoopVectorizer::VectorParts
  1300. InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
  1301. assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
  1302. "Invalid edge");
  1303. VectorParts SrcMask = createBlockInMask(Src);
  1304. // The terminator has to be a branch inst!
  1305. BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  1306. assert(BI && "Unexpected terminator found");
  1307. if (BI->isConditional()) {
  1308. VectorParts EdgeMask = getVectorValue(BI->getCondition());
  1309. if (BI->getSuccessor(0) != Dst)
  1310. for (unsigned part = 0; part < UF; ++part)
  1311. EdgeMask[part] = Builder.CreateNot(EdgeMask[part]);
  1312. for (unsigned part = 0; part < UF; ++part)
  1313. EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
  1314. return EdgeMask;
  1315. }
  1316. return SrcMask;
  1317. }
  1318. InnerLoopVectorizer::VectorParts
  1319. InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
  1320. assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
  1321. // Loop incoming mask is all-one.
  1322. if (OrigLoop->getHeader() == BB) {
  1323. Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
  1324. return getVectorValue(C);
  1325. }
  1326. // This is the block mask. We OR all incoming edges, and with zero.
  1327. Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
  1328. VectorParts BlockMask = getVectorValue(Zero);
  1329. // For each pred:
  1330. for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) {
  1331. VectorParts EM = createEdgeMask(*it, BB);
  1332. for (unsigned part = 0; part < UF; ++part)
  1333. BlockMask[part] = Builder.CreateOr(BlockMask[part], EM[part]);
  1334. }
  1335. return BlockMask;
  1336. }
/// Widen each instruction of block \p BB into UF vector instructions,
/// recording the results in WidenMap. Reduction PHIs are created empty and
/// appended to \p PV so that vectorizeLoop() can wire them up afterwards.
void
InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
                                          BasicBlock *BB, PhiVector *PV) {
  Constant *Zero = Builder.getInt32(0);

  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    VectorParts &Entry = WidenMap.get(it);
    switch (it->getOpcode()) {
    case Instruction::Br:
      // Nothing to do for PHIs and BR, since we already took care of the
      // loop control flow instructions.
      continue;
    case Instruction::PHI: {
      PHINode* P = cast<PHINode>(it);
      // Handle reduction variables:
      if (Legal->getReductionVars()->count(P)) {
        for (unsigned part = 0; part < UF; ++part) {
          // This is phase one of vectorizing PHIs: create the vector PHI
          // with no incoming edges; vectorizeLoop() adds the edges later.
          Type *VecTy = VectorType::get(it->getType(), VF);
          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
                                        LoopVectorBody->getFirstInsertionPt());
        }
        PV->push_back(P);
        continue;
      }

      // Check for PHI nodes that are lowered to vector selects.
      if (P->getParent() != OrigLoop->getHeader()) {
        // We know that all PHIs in non header blocks are converted into
        // selects, so we don't have to worry about the insertion order and we
        // can just use the builder.
        // At this point we generate the predication tree. There may be
        // duplications since this is a simple recursive scan, but future
        // optimizations will clean it up.
        VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
                                          P->getParent());

        for (unsigned part = 0; part < UF; ++part) {
          VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
          VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
                                             "predphi");
        }
        continue;
      }

      // This PHINode must be an induction variable.
      // Make sure that we know about it.
      assert(Legal->getInductionVars()->count(P) &&
             "Not an induction variable");

      LoopVectorizationLegality::InductionInfo II =
        Legal->getInductionVars()->lookup(P);

      switch (II.IK) {
      case LoopVectorizationLegality::IK_NoInduction:
        llvm_unreachable("Unknown induction");
      case LoopVectorizationLegality::IK_IntInduction: {
        assert(P == OldInduction && "Unexpected PHI");
        Value *Broadcasted = getBroadcastInstrs(Induction);
        // After broadcasting the induction variable we need to make the
        // vector consecutive by adding 0, 1, 2 ...
        for (unsigned part = 0; part < UF; ++part)
          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
        continue;
      }
      case LoopVectorizationLegality::IK_ReverseIntInduction:
      case LoopVectorizationLegality::IK_PtrInduction:
        // Handle reverse integer and pointer inductions.
        Value *StartIdx = 0;
        // If we have a single integer induction variable then use it.
        // Otherwise, start counting at zero.
        if (OldInduction) {
          LoopVectorizationLegality::InductionInfo OldII =
            Legal->getInductionVars()->lookup(OldInduction);
          StartIdx = OldII.StartValue;
        } else {
          StartIdx = ConstantInt::get(Induction->getType(), 0);
        }

        // This is the normalized GEP that starts counting at zero.
        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
                                                 "normalized.idx");

        // Handle the reverse integer induction variable case.
        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
                                                 "resize.norm.idx");
          Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
                                                "reverse.idx");

          // This is a new value so do not hoist it out.
          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
          // After broadcasting the induction variable we need to make the
          // vector consecutive by adding  ... -3, -2, -1, 0.
          for (unsigned part = 0; part < UF; ++part)
            Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
          continue;
        }

        // Handle the pointer induction variable case.
        assert(P->getType()->isPointerTy() && "Unexpected type.");

        // This is the vector of results. Notice that we don't generate
        // vector geps because scalar geps result in better code.
        for (unsigned part = 0; part < UF; ++part) {
          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
          for (unsigned int i = 0; i < VF; ++i) {
            Constant *Idx = ConstantInt::get(Induction->getType(),
                                             i + part * VF);
            Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
                                                 "gep.idx");
            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
                                               "next.gep");
            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
                                                 Builder.getInt32(i),
                                                 "insert.gep");
          }
          Entry[part] = VecVal;
        }
        continue;
      }

    }// End of PHI.

    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      // Just widen binops.
      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));

      // Use this vector value for all users of the original instruction.
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A[Part], B[Part]);

        // Update the NSW, NUW and Exact flags. Notice: V can be an Undef.
        BinaryOperator *VecOp = dyn_cast<BinaryOperator>(V);
        if (VecOp && isa<OverflowingBinaryOperator>(BinOp)) {
          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
        }
        if (VecOp && isa<PossiblyExactOperator>(VecOp))
          VecOp->setIsExact(BinOp->isExact());

        Entry[Part] = V;
      }
      break;
    }
    case Instruction::Select: {
      // Widen selects.
      // If the selector is loop invariant we can create a select
      // instruction with a scalar condition. Otherwise, use vector-select.
      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                               OrigLoop);

      // The condition can be loop invariant  but still defined inside the
      // loop. This means that we can't just use the original 'cond' value.
      // We have to take the 'vectorized' value and pick the first lane.
      // Instcombine will make this a no-op.
      VectorParts &Cond = getVectorValue(it->getOperand(0));
      VectorParts &Op0  = getVectorValue(it->getOperand(1));
      VectorParts &Op1  = getVectorValue(it->getOperand(2));
      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
                                                       Builder.getInt32(0));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Entry[Part] = Builder.CreateSelect(
          InvariantCond ? ScalarCond : Cond[Part],
          Op0[Part],
          Op1[Part]);
      }
      break;
    }

    case Instruction::ICmp:
    case Instruction::FCmp: {
      // Widen compares. Generate vector compares.
      bool FCmp = (it->getOpcode() == Instruction::FCmp);
      CmpInst *Cmp = dyn_cast<CmpInst>(it);
      VectorParts &A = getVectorValue(it->getOperand(0));
      VectorParts &B = getVectorValue(it->getOperand(1));
      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *C = 0;
        if (FCmp)
          C = Builder.CreateFCmp(Cmp->getPredicate(), A[Part], B[Part]);
        else
          C = Builder.CreateICmp(Cmp->getPredicate(), A[Part], B[Part]);
        Entry[Part] = C;
      }
      break;
    }

    case Instruction::Store: {
      // Attempt to issue a wide store.
      StoreInst *SI = dyn_cast<StoreInst>(it);
      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
      Value *Ptr = SI->getPointerOperand();
      unsigned Alignment = SI->getAlignment();

      assert(!Legal->isUniform(Ptr) &&
             "We do not allow storing to uniform addresses");

      // Stride 0 means a gather/scatter-style access: scalarize it.
      int Stride = Legal->isConsecutivePtr(Ptr);
      bool Reverse = Stride < 0;
      if (Stride == 0) {
        scalarizeInstruction(it);
        break;
      }

      // Handle consecutive stores.
      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
        VectorParts &GEPParts = getVectorValue(LastGepOperand);
        Value *LastIndex = GEPParts[0];
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        VectorParts &PtrVal = getVectorValue(Ptr);
        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
      }

      VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
      for (unsigned Part = 0; Part < UF; ++Part) {
        // Calculate the pointer for the specific unroll-part.
        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

        if (Reverse) {
          // If we store to reverse consecutive memory locations then we need
          // to reverse the order of elements in the stored value.
          StoredVal[Part] = reverseVector(StoredVal[Part]);
          // If the address is consecutive but reversed, then the
          // wide store needs to start at the last vector element.
          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
        }

        Value *VecPtr = Builder.CreateBitCast(PartPtr, StTy->getPointerTo());
        Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
      }
      break;
    }
    case Instruction::Load: {
      // Attempt to issue a wide load.
      LoadInst *LI = dyn_cast<LoadInst>(it);
      Type *RetTy = VectorType::get(LI->getType(), VF);
      Value *Ptr = LI->getPointerOperand();
      unsigned Alignment = LI->getAlignment();

      // If the pointer is loop invariant or if it is non consecutive,
      // scalarize the load.
      int Stride = Legal->isConsecutivePtr(Ptr);
      bool Reverse = Stride < 0;
      if (Legal->isUniform(Ptr) || Stride == 0) {
        scalarizeInstruction(it);
        break;
      }

      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
      if (Gep) {
        // The last index does not have to be the induction. It can be
        // consecutive and be a function of the index. For example A[I+1];
        unsigned NumOperands = Gep->getNumOperands();
        Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
        VectorParts &GEPParts = getVectorValue(LastGepOperand);
        Value *LastIndex = GEPParts[0];
        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);

        // Create the new GEP with the new induction variable.
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        Gep2->setOperand(NumOperands - 1, LastIndex);
        Ptr = Builder.Insert(Gep2);
      } else {
        // Use the induction element ptr.
        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
        VectorParts &PtrVal = getVectorValue(Ptr);
        Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
      }

      for (unsigned Part = 0; Part < UF; ++Part) {
        // Calculate the pointer for the specific unroll-part.
        Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));

        if (Reverse) {
          // If the address is consecutive but reversed, then the
          // wide load needs to start at the last vector element.
          PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
          PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
        }

        Value *VecPtr = Builder.CreateBitCast(PartPtr, RetTy->getPointerTo());
        Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
        cast<LoadInst>(LI)->setAlignment(Alignment);
        Entry[Part] = Reverse ? reverseVector(LI) :  LI;
      }
      break;
    }
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
      CastInst *CI = dyn_cast<CastInst>(it);
      /// Optimize the special case where the source is the induction
      /// variable. Notice that we can only optimize the 'trunc' case
      /// because: a. FP conversions lose precision, b. sext/zext may wrap,
      /// c. other casts depend on pointer size.
      if (CI->getOperand(0) == OldInduction &&
          it->getOpcode() == Instruction::Trunc) {
        Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction,
                                               CI->getType());
        Value *Broadcasted = getBroadcastInstrs(ScalarCast);
        for (unsigned Part = 0; Part < UF; ++Part)
          Entry[Part] = getConsecutiveVector(Broadcasted, VF * Part, false);
        break;
      }
      /// Vectorize casts.
      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);

      VectorParts &A = getVectorValue(it->getOperand(0));
      for (unsigned Part = 0; Part < UF; ++Part)
        Entry[Part] = Builder.CreateCast(CI->getOpcode(), A[Part], DestTy);
      break;
    }

    case Instruction::Call: {
      // Only trivially-widenable intrinsics reach here; the legality phase
      // rejects other calls.
      assert(isTriviallyVectorizableIntrinsic(it));
      Module *M = BB->getParent()->getParent();
      IntrinsicInst *II = cast<IntrinsicInst>(it);
      Intrinsic::ID ID = II->getIntrinsicID();
      for (unsigned Part = 0; Part < UF; ++Part) {
        SmallVector<Value*, 4> Args;
        for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) {
          VectorParts &Arg = getVectorValue(II->getArgOperand(i));
          Args.push_back(Arg[Part]);
        }
        Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
        Entry[Part] = Builder.CreateCall(F, Args);
      }
      break;
    }

    default:
      // All other instructions are unsupported. Scalarize them.
      scalarizeInstruction(it);
      break;
    }// end of switch.
  }// end of for_each instr.
}
/// Bring the analyses up to date after the new loop structure was built:
/// invalidate SCEV for the original loop and register every newly-created
/// block in the dominator tree.
void InnerLoopVectorizer::updateAnalysis() {
  // Forget the original basic block.
  SE->forgetLoop(OrigLoop);

  // Update the dominator tree information.
  assert(DT->properlyDominates(LoopBypassBlocks.front(), LoopExitBlock) &&
         "Entry does not dominate exit.");

  // Each bypass block is immediately dominated by the previous one.
  for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
    DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
  // The middle block can also be reached by bypassing the vector loop, so
  // its immediate dominator is the first bypass block.
  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  DEBUG(DT->verifyAnalysis());
}
  1702. bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
  1703. if (!EnableIfConversion)
  1704. return false;
  1705. assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
  1706. std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
  1707. // Collect the blocks that need predication.
  1708. for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
  1709. BasicBlock *BB = LoopBlocks[i];
  1710. // We don't support switch statements inside loops.
  1711. if (!isa<BranchInst>(BB->getTerminator()))
  1712. return false;
  1713. // We must have at most two predecessors because we need to convert
  1714. // all PHIs to selects.
  1715. unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
  1716. if (Preds > 2)
  1717. return false;
  1718. // We must be able to predicate all blocks that need to be predicated.
  1719. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
  1720. return false;
  1721. }
  1722. // We can if-convert this loop.
  1723. return true;
  1724. }
/// Top-level legality check: returns true when the loop's shape, trip
/// count, instructions and memory accesses all permit vectorization.
/// The individual checks are ordered from cheapest to most expensive.
bool LoopVectorizationLegality::canVectorize() {
  assert(TheLoop->getLoopPreheader() && "No preheader!!");

  // We can only vectorize innermost loops.
  if (TheLoop->getSubLoopsVector().size())
    return false;

  // We must have a single backedge.
  if (TheLoop->getNumBackEdges() != 1)
    return false;

  // We must have a single exiting block.
  if (!TheLoop->getExitingBlock())
    return false;

  unsigned NumBlocks = TheLoop->getNumBlocks();

  // Check if we can if-convert non single-bb loops.
  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
    return false;
  }

  // We need to have a loop header.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  DEBUG(dbgs() << "LV: Found a loop: " <<
        TheLoop->getHeader()->getName() << "\n");

  // ScalarEvolution needs to be able to find the exit count.
  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
  if (ExitCount == SE->getCouldNotCompute()) {
    DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
    return false;
  }

  // Do not loop-vectorize loops with a tiny trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
  if (TC > 0u && TC < TinyTripCountVectorThreshold) {
    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
          "This loop is not worth vectorizing.\n");
    return false;
  }

  // Check if we can vectorize the instructions and CFG in this loop.
  if (!canVectorizeInstrs()) {
    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
    return false;
  }

  // Go over each instruction and look at memory deps.
  if (!canVectorizeMemory()) {
    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
    return false;
  }

  // Collect all of the variables that remain uniform after vectorization.
  collectLoopUniforms();

  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
        <<"!\n");

  // Okay! We can vectorize. At this point we don't have any other mem analysis
  // which may limit our maximum vectorization factor, so just return true with
  // no restrictions.
  return true;
}
/// Scan every instruction in the loop and verify that it is something the
/// vectorizer knows how to widen. Along the way this classifies the header
/// PHIs into induction variables (recorded in 'Inductions', with the single
/// integer IV cached in 'Induction') and reduction variables (recorded by
/// AddReductionVar), and populates 'AllowedExit' with values that may have
/// users outside the loop.
bool LoopVectorizationLegality::canVectorizeInstrs() {
  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
  BasicBlock *Header = TheLoop->getHeader();

  // For each block in the loop.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the instructions in the block and look for hazards.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
        // This should not happen because the loop should be normalized.
        if (Phi->getNumIncomingValues() != 2) {
          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
          return false;
        }

        // Check that this PHI type is allowed.
        if (!Phi->getType()->isIntegerTy() &&
            !Phi->getType()->isFloatingPointTy() &&
            !Phi->getType()->isPointerTy()) {
          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
          return false;
        }

        // If this PHINode is not in the header block, then we know that we
        // can convert it to select during if-conversion. No need to check if
        // the PHIs in this block are induction or reduction variables.
        if (*bb != Header)
          continue;

        // This is the value coming from the preheader.
        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);

        // Check if this is an induction variable.
        InductionKind IK = isInductionVariable(Phi);

        if (IK_NoInduction != IK) {
          // Int inductions are special because we only allow one IV.
          if (IK == IK_IntInduction) {
            if (Induction) {
              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
              return false;
            }
            Induction = Phi;
          }
          DEBUG(dbgs() << "LV: Found an induction variable.\n");
          Inductions[Phi] = InductionInfo(StartValue, IK);
          continue;
        }

        // Not an induction variable: try each supported reduction kind in
        // turn. AddReductionVar records the reduction when it matches.
        if (AddReductionVar(Phi, RK_IntegerAdd)) {
          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerMult)) {
          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerOr)) {
          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerAnd)) {
          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_IntegerXor)) {
          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatMult)) {
          DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
          continue;
        }
        if (AddReductionVar(Phi, RK_FloatAdd)) {
          DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
          continue;
        }

        // A header PHI that is neither an induction nor a reduction blocks
        // vectorization.
        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
        return false;
      }// end of PHI handling

      // We still don't handle functions.
      CallInst *CI = dyn_cast<CallInst>(it);
      if (CI && !isTriviallyVectorizableIntrinsic(it)) {
        DEBUG(dbgs() << "LV: Found a call site.\n");
        return false;
      }

      // Check that the instruction return type is vectorizable.
      if (!VectorType::isValidElementType(it->getType()) &&
          !it->getType()->isVoidTy()) {
        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
        return false;
      }

      // Check that the stored type is vectorizable.
      if (StoreInst *ST = dyn_cast<StoreInst>(it)) {
        Type *T = ST->getValueOperand()->getType();
        if (!VectorType::isValidElementType(T))
          return false;
      }

      // Reduction instructions are allowed to have exit users.
      // All other instructions must not have external users.
      if (!AllowedExit.count(it))
        //Check that all of the users of the loop are inside the BB.
        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
             I != E; ++I) {
          Instruction *U = cast<Instruction>(*I);
          // This user may be a reduction exit value.
          if (!TheLoop->contains(U)) {
            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
            return false;
          }
        }

    } // next instr.
  }

  // We may proceed without a dedicated integer IV, but only if some other
  // induction variable (pointer or reverse) was found above.
  if (!Induction) {
    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
    assert(getInductionVars()->size() && "No induction variables");
  }

  return true;
}
  1893. void LoopVectorizationLegality::collectLoopUniforms() {
  1894. // We now know that the loop is vectorizable!
  1895. // Collect variables that will remain uniform after vectorization.
  1896. std::vector<Value*> Worklist;
  1897. BasicBlock *Latch = TheLoop->getLoopLatch();
  1898. // Start with the conditional branch and walk up the block.
  1899. Worklist.push_back(Latch->getTerminator()->getOperand(0));
  1900. while (Worklist.size()) {
  1901. Instruction *I = dyn_cast<Instruction>(Worklist.back());
  1902. Worklist.pop_back();
  1903. // Look at instructions inside this loop.
  1904. // Stop when reaching PHI nodes.
  1905. // TODO: we need to follow values all over the loop, not only in this block.
  1906. if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
  1907. continue;
  1908. // This is a known uniform.
  1909. Uniforms.insert(I);
  1910. // Insert all operands.
  1911. for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
  1912. Worklist.push_back(I->getOperand(i));
  1913. }
  1914. }
  1915. }
/// Analyze all loads and stores in the loop and decide whether it is safe to
/// vectorize them, possibly with the help of a runtime overlap check. On
/// return, PtrRtCheck describes the pointers that must be checked at runtime
/// (PtrRtCheck.Need is set when such a check is required).
bool LoopVectorizationLegality::canVectorizeMemory() {
  typedef SmallVector<Value*, 16> ValueVector;
  typedef SmallPtrSet<Value*, 16> ValueSet;

  // Holds the Load and Store *instructions*.
  ValueVector Loads;
  ValueVector Stores;

  // Reset any state left over from a previous query.
  PtrRtCheck.Pointers.clear();
  PtrRtCheck.Need = false;

  // For each block.
  for (Loop::block_iterator bb = TheLoop->block_begin(),
       be = TheLoop->block_end(); bb != be; ++bb) {

    // Scan the BB and collect legal loads and stores.
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {

      // If this is a load, save it. If this instruction can read from memory
      // but is not a load, then we quit. Notice that we don't handle function
      // calls that read or write.
      if (it->mayReadFromMemory()) {
        LoadInst *Ld = dyn_cast<LoadInst>(it);
        if (!Ld) return false;
        // Volatile and atomic loads are not supported.
        if (!Ld->isSimple()) {
          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
          return false;
        }
        Loads.push_back(Ld);
        continue;
      }

      // Save 'store' instructions. Abort if other instructions write to memory.
      if (it->mayWriteToMemory()) {
        StoreInst *St = dyn_cast<StoreInst>(it);
        if (!St) return false;
        // Volatile and atomic stores are not supported.
        if (!St->isSimple()) {
          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
          return false;
        }
        Stores.push_back(St);
      }
    } // next instr.
  } // next block.

  // Now we have two lists that hold the loads and the stores.
  // Next, we find the pointers that they use.

  // Check if we see any stores. If there are no stores, then we don't
  // care if the pointers are *restrict*.
  if (!Stores.size()) {
    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
    return true;
  }

  // Holds the read and read-write *pointers* that we find.
  ValueVector Reads;
  ValueVector ReadWrites;

  // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
  // multiple times on the same object. If the ptr is accessed twice, once
  // for read and once for write, it will only appear once (on the write
  // list). This is okay, since we are going to check for conflicts between
  // writes and between reads and writes, but not between reads and reads.
  ValueSet Seen;

  ValueVector::iterator I, IE;
  for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
    StoreInst *ST = cast<StoreInst>(*I);
    Value* Ptr = ST->getPointerOperand();

    // Storing to the same address on every iteration is a loop-carried
    // dependence we do not handle.
    if (isUniform(Ptr)) {
      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
      return false;
    }

    // If we did *not* see this pointer before, insert it to
    // the read-write list. At this phase it is only a 'write' list.
    if (Seen.insert(Ptr))
      ReadWrites.push_back(Ptr);
  }

  for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
    LoadInst *LD = cast<LoadInst>(*I);
    Value* Ptr = LD->getPointerOperand();
    // If we did *not* see this pointer before, insert it to the
    // read list. If we *did* see it before, then it is already in
    // the read-write list. This allows us to vectorize expressions
    // such as A[i] += x; Because the address of A[i] is a read-write
    // pointer. This only works if the index of A[i] is consecutive.
    // If the address of i is unknown (for example A[B[i]]) then we may
    // read a few words, modify, and write a few words, and some of the
    // words may be written to the same address.
    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
      Reads.push_back(Ptr);
  }

  // If we write (or read-write) to a single destination and there are no
  // other reads in this loop then is it safe to vectorize.
  if (ReadWrites.size() == 1 && Reads.size() == 0) {
    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
    return true;
  }

  // Find pointers with computable bounds. We are going to use this information
  // to place a runtime bound check.
  bool CanDoRT = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      CanDoRT = false;
      break;
    }
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
    if (hasComputableBounds(*I)) {
      PtrRtCheck.insert(SE, TheLoop, *I);
      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
    } else {
      CanDoRT = false;
      break;
    }

  // Check that we did not collect too many pointers or found a
  // unsizeable pointer.
  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
    PtrRtCheck.reset();
    CanDoRT = false;
  }

  if (CanDoRT) {
    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
  }

  bool NeedRTCheck = false;

  // Now that the pointers are in two lists (Reads and ReadWrites), we
  // can check that there are no conflicts between each of the writes and
  // between the writes to the reads.
  ValueSet WriteObjects;
  ValueVector TempObjects;

  // Check that the read-writes do not conflict with other read-write
  // pointers.
  bool AllWritesIdentified = true;
  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      if (!isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
        NeedRTCheck = true;
        AllWritesIdentified = false;
      }
      // Seeing the same underlying object twice on the write side means two
      // writes could be reordered by vectorization; give up.
      if (!WriteObjects.insert(*it)) {
        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  /// Check that the reads don't conflict with the read-writes.
  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
    GetUnderlyingObjects(*I, TempObjects, DL);
    for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
         it != e; ++it) {
      // If all of the writes are identified then we don't care if the read
      // pointer is identified or not.
      if (!AllWritesIdentified && !isIdentifiedObject(*it)) {
        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
        NeedRTCheck = true;
      }
      // A read whose underlying object is also written could be reordered
      // against that write; give up.
      if (WriteObjects.count(*it)) {
        DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
              << **it <<"\n");
        return false;
      }
    }
    TempObjects.clear();
  }

  PtrRtCheck.Need = NeedRTCheck;
  if (NeedRTCheck && !CanDoRT) {
    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
          "the array bounds.\n");
    PtrRtCheck.reset();
    return false;
  }

  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
        " need a runtime memory check.\n");
  return true;
}
/// Try to match the cycle of instructions rooted at \p Phi as a reduction of
/// kind \p Kind. On success the reduction is recorded in 'Reductions' and its
/// single out-of-loop user (if any) is added to 'AllowedExit'.
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
                                                ReductionKind Kind) {
  if (Phi->getNumIncomingValues() != 2)
    return false;

  // Reduction variables are only found in the loop header block.
  if (Phi->getParent() != TheLoop->getHeader())
    return false;

  // Obtain the reduction start value from the value that comes from the loop
  // preheader.
  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());

  // ExitInstruction is the single value which is used outside the loop.
  // We only allow for a single reduction value to be used outside the loop.
  // This includes users of the reduction, variables (which form a cycle
  // which ends in the phi node).
  Instruction *ExitInstruction = 0;

  // Indicates that we found a binary operation in our scan.
  bool FoundBinOp = false;

  // Iter is our iterator. We start with the PHI node and scan for all of the
  // users of this instruction. All users must be instructions that can be
  // used as reduction variables (such as ADD). We may have a single
  // out-of-block user. The cycle must end with the original PHI.
  Instruction *Iter = Phi;
  while (true) {
    // If the instruction has no users then this is a broken
    // chain and can't be a reduction variable.
    if (Iter->use_empty())
      return false;

    // Did we find a user inside this loop already ?
    bool FoundInBlockUser = false;
    // Did we reach the initial PHI node already ?
    bool FoundStartPHI = false;

    // Is this a bin op ?
    FoundBinOp |= !isa<PHINode>(Iter);

    // For each of the *users* of iter.
    // NOTE: Iter is reassigned inside this loop when the next link of the
    // chain is found; iteration continues over the *previous* Iter's users.
    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
         it != e; ++it) {
      Instruction *U = cast<Instruction>(*it);
      // We already know that the PHI is a user.
      if (U == Phi) {
        FoundStartPHI = true;
        continue;
      }

      // Check if we found the exit user.
      BasicBlock *Parent = U->getParent();
      if (!TheLoop->contains(Parent)) {
        // Exit if you find multiple outside users.
        if (ExitInstruction != 0)
          return false;
        ExitInstruction = Iter;
      }

      // We allow in-loop PHINodes which are not the original reduction PHI
      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
      // structure) then don't skip this PHI.
      if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
          U->getParent() != TheLoop->getHeader() &&
          TheLoop->contains(U) &&
          Iter->getNumUses() > 1)
        continue;

      // We can't have multiple inside users.
      if (FoundInBlockUser)
        return false;
      FoundInBlockUser = true;

      // Any reduction instr must be of one of the allowed kinds.
      if (!isReductionInstr(U, Kind))
        return false;

      // Reductions of instructions such as Div, and Sub is only
      // possible if the LHS is the reduction variable.
      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
        return false;

      // Advance the chain to this user.
      Iter = U;
    }

    // We found a reduction var if we have reached the original
    // phi node and we only have a single instruction with out-of-loop
    // users.
    if (FoundStartPHI) {
      // This instruction is allowed to have out-of-loop users.
      AllowedExit.insert(ExitInstruction);

      // Save the description of this reduction variable.
      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
      Reductions[Phi] = RD;
      // We've ended the cycle. This is a reduction variable if we have an
      // outside user and it has a binary op.
      return FoundBinOp && ExitInstruction;
    }
  }
}
  2175. bool
  2176. LoopVectorizationLegality::isReductionInstr(Instruction *I,
  2177. ReductionKind Kind) {
  2178. bool FP = I->getType()->isFloatingPointTy();
  2179. bool FastMath = (FP && I->isCommutative() && I->isAssociative());
  2180. switch (I->getOpcode()) {
  2181. default:
  2182. return false;
  2183. case Instruction::PHI:
  2184. if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
  2185. return false;
  2186. // possibly.
  2187. return true;
  2188. case Instruction::Sub:
  2189. case Instruction::Add:
  2190. return Kind == RK_IntegerAdd;
  2191. case Instruction::SDiv:
  2192. case Instruction::UDiv:
  2193. case Instruction::Mul:
  2194. return Kind == RK_IntegerMult;
  2195. case Instruction::And:
  2196. return Kind == RK_IntegerAnd;
  2197. case Instruction::Or:
  2198. return Kind == RK_IntegerOr;
  2199. case Instruction::Xor:
  2200. return Kind == RK_IntegerXor;
  2201. case Instruction::FMul:
  2202. return Kind == RK_FloatMult && FastMath;
  2203. case Instruction::FAdd:
  2204. return Kind == RK_FloatAdd && FastMath;
  2205. }
  2206. }
  2207. LoopVectorizationLegality::InductionKind
  2208. LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
  2209. Type *PhiTy = Phi->getType();
  2210. // We only handle integer and pointer inductions variables.
  2211. if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
  2212. return IK_NoInduction;
  2213. // Check that the PHI is consecutive and starts at zero.
  2214. const SCEV *PhiScev = SE->getSCEV(Phi);
  2215. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  2216. if (!AR) {
  2217. DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
  2218. return IK_NoInduction;
  2219. }
  2220. const SCEV *Step = AR->getStepRecurrence(*SE);
  2221. // Integer inductions need to have a stride of one.
  2222. if (PhiTy->isIntegerTy()) {
  2223. if (Step->isOne())
  2224. return IK_IntInduction;
  2225. if (Step->isAllOnesValue())
  2226. return IK_ReverseIntInduction;
  2227. return IK_NoInduction;
  2228. }
  2229. // Calculate the pointer stride and check if it is consecutive.
  2230. const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
  2231. if (!C)
  2232. return IK_NoInduction;
  2233. assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
  2234. uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
  2235. if (C->getValue()->equalsInt(Size))
  2236. return IK_PtrInduction;
  2237. return IK_NoInduction;
  2238. }
  2239. bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
  2240. Value *In0 = const_cast<Value*>(V);
  2241. PHINode *PN = dyn_cast_or_null<PHINode>(In0);
  2242. if (!PN)
  2243. return false;
  2244. return Inductions.count(PN);
  2245. }
  2246. bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
  2247. assert(TheLoop->contains(BB) && "Unknown block used");
  2248. // Blocks that do not dominate the latch need predication.
  2249. BasicBlock* Latch = TheLoop->getLoopLatch();
  2250. return !DT->dominates(BB, Latch);
  2251. }
  2252. bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
  2253. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
  2254. // We don't predicate loads/stores at the moment.
  2255. if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
  2256. return false;
  2257. // The instructions below can trap.
  2258. switch (it->getOpcode()) {
  2259. default: continue;
  2260. case Instruction::UDiv:
  2261. case Instruction::SDiv:
  2262. case Instruction::URem:
  2263. case Instruction::SRem:
  2264. return false;
  2265. }
  2266. }
  2267. return true;
  2268. }
  2269. bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
  2270. const SCEV *PhiScev = SE->getSCEV(Ptr);
  2271. const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
  2272. if (!AR)
  2273. return false;
  2274. return AR->isAffine();
  2275. }
  2276. std::pair<unsigned, unsigned>
  2277. LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
  2278. unsigned UserVF) {
  2279. if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
  2280. DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n");
  2281. return std::make_pair(1U, 0U);
  2282. }
  2283. // Find the trip count.
  2284. unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
  2285. DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
  2286. unsigned WidestType = getWidestType();
  2287. unsigned WidestRegister = TTI.getRegisterBitWidth(true);
  2288. unsigned MaxVectorSize = WidestRegister / WidestType;
  2289. DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
  2290. DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
  2291. if (MaxVectorSize == 0) {
  2292. DEBUG(dbgs() << "LV: The target has no vector registers.\n");
  2293. MaxVectorSize = 1;
  2294. }
  2295. assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
  2296. " into one vector!");
  2297. unsigned VF = MaxVectorSize;
  2298. // If we optimize the program for size, avoid creating the tail loop.
  2299. if (OptForSize) {
  2300. // If we are unable to calculate the trip count then don't try to vectorize.
  2301. if (TC < 2) {
  2302. DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
  2303. return std::make_pair(1U, 0U);
  2304. }
  2305. // Find the maximum SIMD width that can fit within the trip count.
  2306. VF = TC % MaxVectorSize;
  2307. if (VF == 0)
  2308. VF = MaxVectorSize;
  2309. // If the trip count that we found modulo the vectorization factor is not
  2310. // zero then we require a tail.
  2311. if (VF < 2) {
  2312. DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n");
  2313. return std::make_pair(1U, 0U);
  2314. }
  2315. }
  2316. if (UserVF != 0) {
  2317. assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
  2318. DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
  2319. return std::make_pair(UserVF, 0U);
  2320. }
  2321. float Cost = expectedCost(1);
  2322. unsigned Width = 1;
  2323. DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
  2324. for (unsigned i=2; i <= VF; i*=2) {
  2325. // Notice that the vector loop needs to be executed less times, so
  2326. // we need to divide the cost of the vector loops by the width of
  2327. // the vector elements.
  2328. float VectorCost = expectedCost(i) / (float)i;
  2329. DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
  2330. (int)VectorCost << ".\n");
  2331. if (VectorCost < Cost) {
  2332. Cost = VectorCost;
  2333. Width = i;
  2334. }
  2335. }
  2336. DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
  2337. return std::make_pair<unsigned, unsigned>(Width, VF * Cost);
  2338. }
  2339. unsigned LoopVectorizationCostModel::getWidestType() {
  2340. unsigned MaxWidth = 8;
  2341. // For each block.
  2342. for (Loop::block_iterator bb = TheLoop->block_begin(),
  2343. be = TheLoop->block_end(); bb != be; ++bb) {
  2344. BasicBlock *BB = *bb;
  2345. // For each instruction in the loop.
  2346. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
  2347. Type *T = it->getType();
  2348. // Only examine Loads, Stores and PHINodes.
  2349. if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
  2350. continue;
  2351. // Examine PHI nodes that are reduction variables.
  2352. if (PHINode *PN = dyn_cast<PHINode>(it))
  2353. if (!Legal->getReductionVars()->count(PN))
  2354. continue;
  2355. // Examine the stored values.
  2356. if (StoreInst *ST = dyn_cast<StoreInst>(it))
  2357. T = ST->getValueOperand()->getType();
  2358. // Ignore stored/loaded pointer types.
  2359. if (T->isPointerTy())
  2360. continue;
  2361. MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
  2362. }
  2363. }
  2364. return MaxWidth;
  2365. }
/// Choose an unroll (interleave) factor for the already-selected
/// vectorization factor \p VF. \p UserUF of zero means "auto". \p LoopCost
/// may be zero, in which case the cost of the loop at \p VF is computed here.
unsigned
LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
                                               unsigned UserUF,
                                               unsigned VF,
                                               unsigned LoopCost) {
  // -- The unroll heuristics --
  // We unroll the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the unroll factor:
  // 1. If the code has reductions the we unroll in order to break the cross
  // iteration dependency.
  // 2. If the loop is really small then we unroll in order to reduce the loop
  // overhead.
  // 3. We don't unroll if we think that we will spill registers to memory due
  // to the increased register pressure.

  // Use the user preference, unless 'auto' is selected.
  if (UserUF != 0)
    return UserUF;

  // When we optimize for size we don't unroll.
  if (OptForSize)
    return 1;

  // Do not unroll loops with a relatively small trip count.
  unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                              TheLoop->getLoopLatch());
  if (TC > 1 && TC < TinyTripCountUnrollThreshold)
    return 1;

  unsigned TargetVectorRegisters = TTI.getNumberOfRegisters(true);
  DEBUG(dbgs() << "LV: The target has " << TargetVectorRegisters <<
        " vector registers\n");

  LoopVectorizationCostModel::RegisterUsage R = calculateRegisterUsage();
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  R.MaxLocalUsers = std::max(R.MaxLocalUsers, 1U);
  R.NumInstructions = std::max(R.NumInstructions, 1U);

  // We calculate the unroll factor using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the unrolled instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills.
  // NOTE(review): if LoopInvariantRegs ever exceeds TargetVectorRegisters the
  // unsigned subtraction wraps, and the clamp below then selects
  // MaxUnrollSize — confirm that is the intended behavior.
  unsigned UF = (TargetVectorRegisters - R.LoopInvariantRegs) / R.MaxLocalUsers;

  // Clamp the unroll factor ranges to reasonable factors.
  unsigned MaxUnrollSize = TTI.getMaximumUnrollFactor();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0)
    LoopCost = expectedCost(VF);

  // Clamp the calculated UF to be between the 1 and the max unroll factor
  // that the target allows.
  if (UF > MaxUnrollSize)
    UF = MaxUnrollSize;
  else if (UF < 1)
    UF = 1;

  // Heuristic 1: always unroll in the presence of reductions to break the
  // cross-iteration dependency chain.
  if (Legal->getReductionVars()->size()) {
    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
    return UF;
  }

  // Heuristic 2:
  // We want to unroll tiny loops in order to reduce the loop overhead.
  // We assume that the cost overhead is 1 and we use the cost model
  // to estimate the cost of the loop and unroll until the cost of the
  // loop overhead is about 5% of the cost of the loop.
  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
  if (LoopCost < 20) {
    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
    unsigned NewUF = 20/LoopCost + 1;
    return std::min(NewUF, UF);
  }

  DEBUG(dbgs() << "LV: Not Unrolling. \n");
  return 1;
}
LoopVectorizationCostModel::RegisterUsage
LoopVectorizationCostModel::calculateRegisterUsage() {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is
  // a very rough estimation. We scan the loop in a topological order in order
  // to assign a number to each instruction. We use RPO to ensure that defs
  // are met before their users. We assume that each instruction that has
  // in-loop users starts an interval. We record every time that an in-loop
  // value is used, so we have a list of the first and last occurrences of
  // each instruction. Next, we transpose this data structure into a multi map
  // that holds the list of intervals that *end* at a specific location. This
  // multi map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it
  // in a set. If we find this value in the multi-map then we remove it from
  // the set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but
  // are used inside the loop. We need this number separately from the
  // max-interval usage number because when we unroll, loop-invariant values
  // do not take more registers.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);

  RegisterUsage R;
  R.NumInstructions = 0;

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.
  typedef DenseMap<Instruction*, unsigned> IntervalMap;
  // Maps instruction to its index.
  DenseMap<unsigned, Instruction*> IdxToInstr;
  // Marks the end of each interval.
  IntervalMap EndPoint;
  // Saves the list of instruction indices that are used in the loop.
  SmallSet<Instruction*, 8> Ends;
  // Saves the list of values that are used in the loop but are
  // defined outside the loop, such as arguments and constants.
  SmallPtrSet<Value*, 8> LoopInvariants;

  // Number the instructions in reverse-post-order so that every def receives
  // a smaller index than all of its in-loop users.
  unsigned Index = 0;
  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
       be = DFS.endRPO(); bb != be; ++bb) {
    R.NumInstructions += (*bb)->size();
    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
         ++it) {
      Instruction *I = it;
      IdxToInstr[Index++] = I;

      // Save the end location of each USE.
      for (unsigned i = 0; i < I->getNumOperands(); ++i) {
        Value *U = I->getOperand(i);
        Instruction *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        if (!Instr) continue;

        // If this instruction is outside the loop then record it and continue.
        if (!TheLoop->contains(Instr)) {
          LoopInvariants.insert(Instr);
          continue;
        }

        // Overwrite previous end points. Since uses are visited in program
        // order, the map ends up holding the index of the *last* use.
        EndPoint[Instr] = Index;
        Ends.insert(Instr);
      }
    }
  }

  // Saves the list of intervals that end with the index in 'key'.
  typedef SmallVector<Instruction*, 2> InstrList;
  DenseMap<unsigned, InstrList> TransposeEnds;

  // Transpose the EndPoints to a list of values that end at each index.
  for (IntervalMap::iterator it = EndPoint.begin(), e = EndPoint.end();
       it != e; ++it)
    TransposeEnds[it->second].push_back(it->first);

  SmallSet<Instruction*, 8> OpenIntervals;
  unsigned MaxUsage = 0;

  DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
  for (unsigned int i = 0; i < Index; ++i) {
    Instruction *I = IdxToInstr[i];
    // Ignore instructions that are never used within the loop.
    if (!Ends.count(I)) continue;

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[i];
    for (unsigned int j=0, e = List.size(); j < e; ++j)
      OpenIntervals.erase(List[j]);

    // Count the number of live intervals.
    MaxUsage = std::max(MaxUsage, OpenIntervals.size());

    DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
          OpenIntervals.size() <<"\n");

    // Add the current instruction to the list of open intervals.
    OpenIntervals.insert(I);
  }

  unsigned Invariant = LoopInvariants.size();
  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");

  R.LoopInvariantRegs = Invariant;
  R.MaxLocalUsers = MaxUsage;
  return R;
}
  2533. unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  2534. unsigned Cost = 0;
  2535. // For each block.
  2536. for (Loop::block_iterator bb = TheLoop->block_begin(),
  2537. be = TheLoop->block_end(); bb != be; ++bb) {
  2538. unsigned BlockCost = 0;
  2539. BasicBlock *BB = *bb;
  2540. // For each instruction in the old loop.
  2541. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
  2542. unsigned C = getInstructionCost(it, VF);
  2543. Cost += C;
  2544. DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
  2545. VF << " For instruction: "<< *it << "\n");
  2546. }
  2547. // We assume that if-converted blocks have a 50% chance of being executed.
  2548. // When the code is scalar then some of the blocks are avoided due to CF.
  2549. // When the code is vectorized we execute all code paths.
  2550. if (Legal->blockNeedsPredication(*bb) && VF == 1)
  2551. BlockCost /= 2;
  2552. Cost += BlockCost;
  2553. }
  2554. return Cost;
  2555. }
  2556. unsigned
  2557. LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  2558. // If we know that this instruction will remain uniform, check the cost of
  2559. // the scalar version.
  2560. if (Legal->isUniformAfterVectorization(I))
  2561. VF = 1;
  2562. Type *RetTy = I->getType();
  2563. Type *VectorTy = ToVectorTy(RetTy, VF);
  2564. // TODO: We need to estimate the cost of intrinsic calls.
  2565. switch (I->getOpcode()) {
  2566. case Instruction::GetElementPtr:
  2567. // We mark this instruction as zero-cost because scalar GEPs are usually
  2568. // lowered to the intruction addressing mode. At the moment we don't
  2569. // generate vector geps.
  2570. return 0;
  2571. case Instruction::Br: {
  2572. return TTI.getCFInstrCost(I->getOpcode());
  2573. }
  2574. case Instruction::PHI:
  2575. //TODO: IF-converted IFs become selects.
  2576. return 0;
  2577. case Instruction::Add:
  2578. case Instruction::FAdd:
  2579. case Instruction::Sub:
  2580. case Instruction::FSub:
  2581. case Instruction::Mul:
  2582. case Instruction::FMul:
  2583. case Instruction::UDiv:
  2584. case Instruction::SDiv:
  2585. case Instruction::FDiv:
  2586. case Instruction::URem:
  2587. case Instruction::SRem:
  2588. case Instruction::FRem:
  2589. case Instruction::Shl:
  2590. case Instruction::LShr:
  2591. case Instruction::AShr:
  2592. case Instruction::And:
  2593. case Instruction::Or:
  2594. case Instruction::Xor:
  2595. return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
  2596. case Instruction::Select: {
  2597. SelectInst *SI = cast<SelectInst>(I);
  2598. const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
  2599. bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
  2600. Type *CondTy = SI->getCondition()->getType();
  2601. if (ScalarCond)
  2602. CondTy = VectorType::get(CondTy, VF);
  2603. return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
  2604. }
  2605. case Instruction::ICmp:
  2606. case Instruction::FCmp: {
  2607. Type *ValTy = I->getOperand(0)->getType();
  2608. VectorTy = ToVectorTy(ValTy, VF);
  2609. return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
  2610. }
  2611. case Instruction::Store: {
  2612. StoreInst *SI = cast<StoreInst>(I);
  2613. Type *ValTy = SI->getValueOperand()->getType();
  2614. VectorTy = ToVectorTy(ValTy, VF);
  2615. if (VF == 1)
  2616. return TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
  2617. SI->getAlignment(),
  2618. SI->getPointerAddressSpace());
  2619. // Scalarized stores.
  2620. int Stride = Legal->isConsecutivePtr(SI->getPointerOperand());
  2621. bool Reverse = Stride < 0;
  2622. if (0 == Stride) {
  2623. unsigned Cost = 0;
  2624. // The cost of extracting from the value vector and pointer vector.
  2625. Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
  2626. for (unsigned i = 0; i < VF; ++i) {
  2627. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
  2628. i);
  2629. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
  2630. }
  2631. // The cost of the scalar stores.
  2632. Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
  2633. SI->getAlignment(),
  2634. SI->getPointerAddressSpace());
  2635. return Cost;
  2636. }
  2637. // Wide stores.
  2638. unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
  2639. SI->getAlignment(),
  2640. SI->getPointerAddressSpace());
  2641. if (Reverse)
  2642. Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
  2643. VectorTy, 0);
  2644. return Cost;
  2645. }
  2646. case Instruction::Load: {
  2647. LoadInst *LI = cast<LoadInst>(I);
  2648. if (VF == 1)
  2649. return TTI.getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
  2650. LI->getPointerAddressSpace());
  2651. // Scalarized loads.
  2652. int Stride = Legal->isConsecutivePtr(LI->getPointerOperand());
  2653. bool Reverse = Stride < 0;
  2654. if (0 == Stride) {
  2655. unsigned Cost = 0;
  2656. Type *PtrTy = ToVectorTy(I->getOperand(0)->getType(), VF);
  2657. // The cost of extracting from the pointer vector.
  2658. for (unsigned i = 0; i < VF; ++i)
  2659. Cost += TTI.getVectorInstrCost(Instruction::ExtractElement, PtrTy, i);
  2660. // The cost of inserting data to the result vector.
  2661. for (unsigned i = 0; i < VF; ++i)
  2662. Cost += TTI.getVectorInstrCost(Instruction::InsertElement, VectorTy, i);
  2663. // The cost of the scalar stores.
  2664. Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), RetTy->getScalarType(),
  2665. LI->getAlignment(),
  2666. LI->getPointerAddressSpace());
  2667. return Cost;
  2668. }
  2669. // Wide loads.
  2670. unsigned Cost = TTI.getMemoryOpCost(I->getOpcode(), VectorTy,
  2671. LI->getAlignment(),
  2672. LI->getPointerAddressSpace());
  2673. if (Reverse)
  2674. Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
  2675. return Cost;
  2676. }
  2677. case Instruction::ZExt:
  2678. case Instruction::SExt:
  2679. case Instruction::FPToUI:
  2680. case Instruction::FPToSI:
  2681. case Instruction::FPExt:
  2682. case Instruction::PtrToInt:
  2683. case Instruction::IntToPtr:
  2684. case Instruction::SIToFP:
  2685. case Instruction::UIToFP:
  2686. case Instruction::Trunc:
  2687. case Instruction::FPTrunc:
  2688. case Instruction::BitCast: {
  2689. // We optimize the truncation of induction variable.
  2690. // The cost of these is the same as the scalar operation.
  2691. if (I->getOpcode() == Instruction::Trunc &&
  2692. Legal->isInductionVariable(I->getOperand(0)))
  2693. return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
  2694. I->getOperand(0)->getType());
  2695. Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
  2696. return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
  2697. }
  2698. case Instruction::Call: {
  2699. assert(isTriviallyVectorizableIntrinsic(I));
  2700. IntrinsicInst *II = cast<IntrinsicInst>(I);
  2701. Type *RetTy = ToVectorTy(II->getType(), VF);
  2702. SmallVector<Type*, 4> Tys;
  2703. for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
  2704. Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
  2705. return TTI.getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
  2706. }
  2707. default: {
  2708. // We are scalarizing the instruction. Return the cost of the scalar
  2709. // instruction, plus the cost of insert and extract into vector
  2710. // elements, times the vector width.
  2711. unsigned Cost = 0;
  2712. if (!RetTy->isVoidTy() && VF != 1) {
  2713. unsigned InsCost = TTI.getVectorInstrCost(Instruction::InsertElement,
  2714. VectorTy);
  2715. unsigned ExtCost = TTI.getVectorInstrCost(Instruction::ExtractElement,
  2716. VectorTy);
  2717. // The cost of inserting the results plus extracting each one of the
  2718. // operands.
  2719. Cost += VF * (InsCost + ExtCost * I->getNumOperands());
  2720. }
  2721. // The cost of executing VF copies of the scalar instruction. This opcode
  2722. // is unknown. Assume that it is the same as 'mul'.
  2723. Cost += VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy);
  2724. return Cost;
  2725. }
  2726. }// end of switch.
  2727. }
  2728. Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
  2729. if (Scalar->isVoidTy() || VF == 1)
  2730. return Scalar;
  2731. return VectorType::get(Scalar, VF);
  2732. }
// Pass identification: the address of ID is the unique identity of the pass.
char LoopVectorize::ID = 0;
// Human-readable pass name shown by -debug-pass and opt -help.
static const char lv_name[] = "Loop Vectorization";
// Register the pass with the PassRegistry and declare the analyses it
// depends on (alias analysis, target cost info, SCEV, and loop-simplify
// canonicalization), so the pass manager schedules them first.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
  2741. namespace llvm {
  2742. Pass *createLoopVectorizePass() {
  2743. return new LoopVectorize();
  2744. }
  2745. }