  1. //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  10. // stores that can be put together into vector-stores. Next, it attempts to
  11. // construct vectorizable tree using the use-def chains. If a profitable tree
  12. // was found, the SLP vectorizer performs vectorization on the tree.
  13. //
  14. // The pass is inspired by the work described in the paper:
  15. // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  16. //
  17. //===----------------------------------------------------------------------===//
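// Illustrative sketch (not part of the original pass; names are made up): a
// typical candidate for SLP vectorization is a run of consecutive stores of
// independent, isomorphic scalar operations, e.g.
//
//   a[0] = b[0] + c[0];
//   a[1] = b[1] + c[1];
//   a[2] = b[2] + c[2];
//   a[3] = b[3] + c[3];
//
// The pass bundles the four stores, recursively bundles their operands along
// the use-def chains, and, if the cost model says the tree is profitable,
// replaces the scalars with one vector load per operand, a single vector add,
// and a single vector store.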
  18. #include "llvm/ADT/MapVector.h"
  19. #include "llvm/ADT/Optional.h"
  20. #include "llvm/ADT/PostOrderIterator.h"
  21. #include "llvm/ADT/SetVector.h"
  22. #include "llvm/ADT/Statistic.h"
  23. #include "llvm/Analysis/AliasAnalysis.h"
  24. #include "llvm/Analysis/AssumptionCache.h"
  25. #include "llvm/Analysis/CodeMetrics.h"
  26. #include "llvm/Analysis/DemandedBits.h"
  27. #include "llvm/Analysis/GlobalsModRef.h"
  28. #include "llvm/Analysis/LoopAccessAnalysis.h"
  30. #include "llvm/Analysis/LoopInfo.h"
  31. #include "llvm/Analysis/ScalarEvolution.h"
  32. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  33. #include "llvm/Analysis/TargetTransformInfo.h"
  34. #include "llvm/Analysis/ValueTracking.h"
  35. #include "llvm/Analysis/VectorUtils.h"
  36. #include "llvm/IR/DataLayout.h"
  37. #include "llvm/IR/Dominators.h"
  38. #include "llvm/IR/IRBuilder.h"
  39. #include "llvm/IR/Instructions.h"
  40. #include "llvm/IR/IntrinsicInst.h"
  41. #include "llvm/IR/Module.h"
  42. #include "llvm/IR/NoFolder.h"
  43. #include "llvm/IR/Type.h"
  44. #include "llvm/IR/Value.h"
  45. #include "llvm/IR/Verifier.h"
  46. #include "llvm/Pass.h"
  47. #include "llvm/Support/CommandLine.h"
  48. #include "llvm/Support/Debug.h"
  49. #include "llvm/Support/raw_ostream.h"
  50. #include "llvm/Transforms/Vectorize.h"
  51. #include <algorithm>
  52. #include <memory>
  53. using namespace llvm;
  54. #define SV_NAME "slp-vectorizer"
  55. #define DEBUG_TYPE "SLP"
  56. STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
  57. static cl::opt<int>
  58. SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  59. cl::desc("Only vectorize if you gain more than this "
  60. "number "));
  61. static cl::opt<bool>
  62. ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
  63. cl::desc("Attempt to vectorize horizontal reductions"));
  64. static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  65. "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  66. cl::desc(
  67. "Attempt to vectorize horizontal reductions feeding into a store"));
  68. static cl::opt<int>
  69. MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
  70. cl::desc("Attempt to vectorize for this register size in bits"));
  71. /// Limits the size of scheduling regions in a block.
  73. /// It avoids long compile times for _very_ large blocks where vector
  73. /// instructions are spread over a wide range.
  74. /// This limit is way higher than needed by real-world functions.
  75. static cl::opt<int>
  76. ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
  77. cl::desc("Limit the size of the SLP scheduling region per block"));
  78. static cl::opt<int> MinVectorRegSizeOption(
  79. "slp-min-reg-size", cl::init(128), cl::Hidden,
  80. cl::desc("Attempt to vectorize for this register size in bits"));
  81. namespace {
  82. // FIXME: Set this via cl::opt to allow overriding.
  83. static const unsigned RecursionMaxDepth = 12;
  84. // Limit the number of alias checks. The limit is chosen so that
  85. // it has no negative effect on the llvm benchmarks.
  86. static const unsigned AliasedCheckLimit = 10;
  87. // Another limit for the alias checks: The maximum distance between load/store
  88. // instructions where alias checks are done.
  89. // This limit is useful for very large basic blocks.
  90. static const unsigned MaxMemDepDistance = 160;
  91. /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
  92. /// regions to be handled.
  93. static const int MinScheduleRegionSize = 16;
  94. /// \brief Predicate for the element types that the SLP vectorizer supports.
  95. ///
  96. /// The most important things to filter here are types that are invalid in LLVM
  97. /// vectors. We also filter target specific types which have absolutely no
  98. /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
  99. /// avoids spending time checking the cost model and realizing that they will
  100. /// be inevitably scalarized.
  101. static bool isValidElementType(Type *Ty) {
  102. return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
  103. !Ty->isPPC_FP128Ty();
  104. }
  105. /// \returns the parent basic block if all of the instructions in \p VL
  106. /// are in the same block or null otherwise.
  107. static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  108. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  109. if (!I0)
  110. return nullptr;
  111. BasicBlock *BB = I0->getParent();
  112. for (int i = 1, e = VL.size(); i < e; i++) {
  113. Instruction *I = dyn_cast<Instruction>(VL[i]);
  114. if (!I)
  115. return nullptr;
  116. if (BB != I->getParent())
  117. return nullptr;
  118. }
  119. return BB;
  120. }
  121. /// \returns True if all of the values in \p VL are constants.
  122. static bool allConstant(ArrayRef<Value *> VL) {
  123. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  124. if (!isa<Constant>(VL[i]))
  125. return false;
  126. return true;
  127. }
  128. /// \returns True if all of the values in \p VL are identical.
  129. static bool isSplat(ArrayRef<Value *> VL) {
  130. for (unsigned i = 1, e = VL.size(); i < e; ++i)
  131. if (VL[i] != VL[0])
  132. return false;
  133. return true;
  134. }
  135. /// \returns the opcode that can be paired with \p Op to create an alternate
  136. /// sequence which can later be merged as a ShuffleVector instruction.
  137. static unsigned getAltOpcode(unsigned Op) {
  138. switch (Op) {
  139. case Instruction::FAdd:
  140. return Instruction::FSub;
  141. case Instruction::FSub:
  142. return Instruction::FAdd;
  143. case Instruction::Add:
  144. return Instruction::Sub;
  145. case Instruction::Sub:
  146. return Instruction::Add;
  147. default:
  148. return 0;
  149. }
  150. }
  151. /// \returns true if Opcode \p Op can be part
  152. /// of an alternate sequence which can later be merged as
  153. /// a ShuffleVector instruction.
  154. static bool canCombineAsAltInst(unsigned Op) {
  155. return Op == Instruction::FAdd || Op == Instruction::FSub ||
  156. Op == Instruction::Sub || Op == Instruction::Add;
  157. }
  158. /// \returns Instruction::ShuffleVector if the instructions in \p VL form an
  159. /// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
  160. /// (e.g. opcodes fadd, fsub, fadd, fsub, ...), or zero otherwise.
  161. static unsigned isAltInst(ArrayRef<Value *> VL) {
  162. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  163. unsigned Opcode = I0->getOpcode();
  164. unsigned AltOpcode = getAltOpcode(Opcode);
  165. for (int i = 1, e = VL.size(); i < e; i++) {
  166. Instruction *I = dyn_cast<Instruction>(VL[i]);
  167. if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
  168. return 0;
  169. }
  170. return Instruction::ShuffleVector;
  171. }
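// Illustrative sketch (not part of the original source): an alternating
// bundle such as
//   %r0 = fadd float %a0, %b0
//   %r1 = fsub float %a1, %b1
//   %r2 = fadd float %a2, %b2
//   %r3 = fsub float %a3, %b3
// matches the pattern above (even lanes use Opcode, odd lanes use its
// alternate), so it can be vectorized as one <4 x float> fadd and one
// <4 x float> fsub whose lanes are then blended with a shufflevector.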
  172. /// \returns The opcode if all of the Instructions in \p VL have the same
  173. /// opcode, or zero.
  174. static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  175. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  176. if (!I0)
  177. return 0;
  178. unsigned Opcode = I0->getOpcode();
  179. for (int i = 1, e = VL.size(); i < e; i++) {
  180. Instruction *I = dyn_cast<Instruction>(VL[i]);
  181. if (!I || Opcode != I->getOpcode()) {
  182. if (canCombineAsAltInst(Opcode) && i == 1)
  183. return isAltInst(VL);
  184. return 0;
  185. }
  186. }
  187. return Opcode;
  188. }
  189. /// Get the intersection (logical and) of all of the potential IR flags
  190. /// of each scalar operation (VL) that will be converted into a vector (I).
  191. /// Flag set: NSW, NUW, exact, and all of fast-math.
  192. static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  193. if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
  194. if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
  195. // Intersection is initialized to the 0th scalar,
  196. // so start counting from index '1'.
  197. for (int i = 1, e = VL.size(); i < e; ++i) {
  198. if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
  199. Intersection->andIRFlags(Scalar);
  200. }
  201. VecOp->copyIRFlags(Intersection);
  202. }
  203. }
  204. }
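// Illustrative sketch (not part of the original source): if the scalar bundle
// is { add nsw, add nsw nuw, add nsw, add nsw }, the intersection keeps only
// 'nsw', so the emitted vector instruction is an 'add nsw' on the vector type;
// any flag that is missing from even one scalar is dropped from the vector op.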
  205. /// \returns \p I after propagating metadata from \p VL.
  206. static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  207. Instruction *I0 = cast<Instruction>(VL[0]);
  208. SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  209. I0->getAllMetadataOtherThanDebugLoc(Metadata);
  210. for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
  211. unsigned Kind = Metadata[i].first;
  212. MDNode *MD = Metadata[i].second;
  213. for (int i = 1, e = VL.size(); MD && i != e; i++) {
  214. Instruction *I = cast<Instruction>(VL[i]);
  215. MDNode *IMD = I->getMetadata(Kind);
  216. switch (Kind) {
  217. default:
  218. MD = nullptr; // Remove unknown metadata
  219. break;
  220. case LLVMContext::MD_tbaa:
  221. MD = MDNode::getMostGenericTBAA(MD, IMD);
  222. break;
  223. case LLVMContext::MD_alias_scope:
  224. MD = MDNode::getMostGenericAliasScope(MD, IMD);
  225. break;
  226. case LLVMContext::MD_noalias:
  227. MD = MDNode::intersect(MD, IMD);
  228. break;
  229. case LLVMContext::MD_fpmath:
  230. MD = MDNode::getMostGenericFPMath(MD, IMD);
  231. break;
  232. case LLVMContext::MD_nontemporal:
  233. MD = MDNode::intersect(MD, IMD);
  234. break;
  235. }
  236. }
  237. I->setMetadata(Kind, MD);
  238. }
  239. return I;
  240. }
  241. /// \returns The type that all of the values in \p VL have or null if there
  242. /// are different types.
  243. static Type* getSameType(ArrayRef<Value *> VL) {
  244. Type *Ty = VL[0]->getType();
  245. for (int i = 1, e = VL.size(); i < e; i++)
  246. if (VL[i]->getType() != Ty)
  247. return nullptr;
  248. return Ty;
  249. }
  250. /// \returns True if the ExtractElement instructions in VL can be vectorized
  251. /// to use the original vector.
  252. static bool CanReuseExtract(ArrayRef<Value *> VL) {
  253. assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  254. // Check if all of the extracts come from the same vector and from the
  255. // correct offset.
  256. Value *VL0 = VL[0];
  257. ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  258. Value *Vec = E0->getOperand(0);
  259. // We have to extract from the same vector type.
  260. unsigned NElts = Vec->getType()->getVectorNumElements();
  261. if (NElts != VL.size())
  262. return false;
  263. // Check that all of the indices extract from the correct offset.
  264. ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  265. if (!CI || CI->getZExtValue())
  266. return false;
  267. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  268. ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
  269. ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
  270. if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
  271. return false;
  272. }
  273. return true;
  274. }
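// Illustrative sketch (not part of the original source): for a <4 x float>
// value %v, the bundle
//   { extractelement %v, 0, extractelement %v, 1,
//     extractelement %v, 2, extractelement %v, 3 }
// can simply reuse %v, whereas extracts from different vectors, out-of-order
// indices, or a vector width that differs from the bundle size force a
// gather instead.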
  275. /// \returns True if an in-tree use also needs an extract. This refers to a
  276. /// possible scalar operand in a vectorized instruction.
  277. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
  278. TargetLibraryInfo *TLI) {
  279. unsigned Opcode = UserInst->getOpcode();
  280. switch (Opcode) {
  281. case Instruction::Load: {
  282. LoadInst *LI = cast<LoadInst>(UserInst);
  283. return (LI->getPointerOperand() == Scalar);
  284. }
  285. case Instruction::Store: {
  286. StoreInst *SI = cast<StoreInst>(UserInst);
  287. return (SI->getPointerOperand() == Scalar);
  288. }
  289. case Instruction::Call: {
  290. CallInst *CI = cast<CallInst>(UserInst);
  291. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  292. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  293. return (CI->getArgOperand(1) == Scalar);
  294. }
  295. }
  296. default:
  297. return false;
  298. }
  299. }
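// Illustrative sketch (not part of the original source): a vectorized load or
// store bundle still consumes a single scalar pointer (taken from lane 0), and
// calls such as @llvm.powi keep their exponent as a scalar argument; if that
// scalar operand is itself produced by the tree, an extract is needed even
// though the user instruction is vectorized.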
  300. /// \returns the AA location that is being accessed by the instruction.
  301. static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  302. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  303. return MemoryLocation::get(SI);
  304. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  305. return MemoryLocation::get(LI);
  306. return MemoryLocation();
  307. }
  308. /// \returns True if the instruction is not a volatile or atomic load/store.
  309. static bool isSimple(Instruction *I) {
  310. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  311. return LI->isSimple();
  312. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  313. return SI->isSimple();
  314. if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
  315. return !MI->isVolatile();
  316. return true;
  317. }
  318. /// Bottom Up SLP Vectorizer.
  319. class BoUpSLP {
  320. public:
  321. typedef SmallVector<Value *, 8> ValueList;
  322. typedef SmallVector<Instruction *, 16> InstrList;
  323. typedef SmallPtrSet<Value *, 16> ValueSet;
  324. typedef SmallVector<StoreInst *, 8> StoreList;
  325. BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
  326. TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
  327. DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
  328. const DataLayout *DL)
  329. : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
  330. SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
  331. DL(DL), Builder(Se->getContext()) {
  332. CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  333. }
  334. /// \brief Vectorize the tree that starts with the elements in \p VL.
  335. /// Returns the vectorized root.
  336. Value *vectorizeTree();
  337. /// \returns the cost incurred by unwanted spills and fills, caused by
  338. /// holding live values over call sites.
  339. int getSpillCost();
  340. /// \returns the vectorization cost of the subtree that starts at \p VL.
  341. /// A negative number means that this is profitable.
  342. int getTreeCost();
  343. /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  344. /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  345. void buildTree(ArrayRef<Value *> Roots,
  346. ArrayRef<Value *> UserIgnoreLst = None);
  347. /// Clear the internal data structures that are created by 'buildTree'.
  348. void deleteTree() {
  349. VectorizableTree.clear();
  350. ScalarToTreeEntry.clear();
  351. MustGather.clear();
  352. ExternalUses.clear();
  353. NumLoadsWantToKeepOrder = 0;
  354. NumLoadsWantToChangeOrder = 0;
  355. for (auto &Iter : BlocksSchedules) {
  356. BlockScheduling *BS = Iter.second.get();
  357. BS->clear();
  358. }
  359. MinBWs.clear();
  360. }
  361. /// \brief Perform LICM and CSE on the newly generated gather sequences.
  362. void optimizeGatherSequence();
  363. /// \returns true if it is beneficial to reverse the vector order.
  364. bool shouldReorder() const {
  365. return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  366. }
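// Illustrative sketch (not part of the original source): a size-2 bundle
// { load a[1], load a[0] } is consecutive only when reversed, so it bumps
// NumLoadsWantToChangeOrder; once such bundles outnumber the already ordered
// ones, shouldReorder() returns true and the caller may retry vectorization
// with the bundle operands swapped.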
  367. /// \return The vector element size in bits to use when vectorizing the
  368. /// expression tree ending at \p V. If V is a store, the size is the width of
  369. /// the stored value. Otherwise, the size is the width of the largest loaded
  370. /// value reaching V. This method is used by the vectorizer to calculate
  371. /// vectorization factors.
  372. unsigned getVectorElementSize(Value *V);
  373. /// Compute the minimum type sizes required to represent the entries in a
  374. /// vectorizable tree.
  375. void computeMinimumValueSizes();
  376. private:
  377. struct TreeEntry;
  378. /// \returns the cost of the vectorizable entry.
  379. int getEntryCost(TreeEntry *E);
  380. /// This is the recursive part of buildTree.
  381. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
  382. /// Vectorize a single entry in the tree.
  383. Value *vectorizeTree(TreeEntry *E);
  384. /// Vectorize a single entry in the tree, starting in \p VL.
  385. Value *vectorizeTree(ArrayRef<Value *> VL);
  386. /// \returns the pointer to the vectorized value if \p VL is already
  387. /// vectorized, or NULL. This may happen in cycles.
  388. Value *alreadyVectorized(ArrayRef<Value *> VL) const;
  389. /// \returns the scalarization cost for this type. Scalarization in this
  390. /// context means the creation of vectors from a group of scalars.
  391. int getGatherCost(Type *Ty);
  392. /// \returns the scalarization cost for this list of values. Assuming that
  393. /// this subtree gets vectorized, we may need to extract the values from the
  394. /// roots. This method calculates the cost of extracting the values.
  395. int getGatherCost(ArrayRef<Value *> VL);
  396. /// \brief Set the Builder insert point to one after the last instruction in
  397. /// the bundle
  398. void setInsertPointAfterBundle(ArrayRef<Value *> VL);
  399. /// \returns a vector from a collection of scalars in \p VL.
  400. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
  401. /// \returns whether the VectorizableTree is fully vectorizable and will
  402. /// be beneficial even if the tree height is tiny.
  403. bool isFullyVectorizableTinyTree();
  404. /// \brief Reorder commutative operands in alt shuffle if they result in
  405. /// vectorized code.
  406. void reorderAltShuffleOperands(ArrayRef<Value *> VL,
  407. SmallVectorImpl<Value *> &Left,
  408. SmallVectorImpl<Value *> &Right);
  409. /// \brief Reorder commutative operands to get better probability of
  410. /// generating vectorized code.
  411. void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  412. SmallVectorImpl<Value *> &Left,
  413. SmallVectorImpl<Value *> &Right);
  414. struct TreeEntry {
  415. TreeEntry() : Scalars(), VectorizedValue(nullptr),
  416. NeedToGather(0) {}
  417. /// \returns true if the scalars in VL are equal to this entry.
  418. bool isSame(ArrayRef<Value *> VL) const {
  419. assert(VL.size() == Scalars.size() && "Invalid size");
  420. return std::equal(VL.begin(), VL.end(), Scalars.begin());
  421. }
  422. /// A vector of scalars.
  423. ValueList Scalars;
  424. /// The Scalars are vectorized into this value. It is initialized to Null.
  425. Value *VectorizedValue;
  426. /// Do we need to gather this sequence?
  427. bool NeedToGather;
  428. };
  429. /// Create a new VectorizableTree entry.
  430. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
  431. VectorizableTree.emplace_back();
  432. int idx = VectorizableTree.size() - 1;
  433. TreeEntry *Last = &VectorizableTree[idx];
  434. Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
  435. Last->NeedToGather = !Vectorized;
  436. if (Vectorized) {
  437. for (int i = 0, e = VL.size(); i != e; ++i) {
  438. assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
  439. ScalarToTreeEntry[VL[i]] = idx;
  440. }
  441. } else {
  442. MustGather.insert(VL.begin(), VL.end());
  443. }
  444. return Last;
  445. }
  446. /// -- Vectorization State --
  447. /// Holds all of the tree entries.
  448. std::vector<TreeEntry> VectorizableTree;
  449. /// Maps a specific scalar to its tree entry.
  450. SmallDenseMap<Value*, int> ScalarToTreeEntry;
  451. /// A list of scalars that we found that we need to keep as scalars.
  452. ValueSet MustGather;
  453. /// This POD struct describes one external user in the vectorized tree.
  454. struct ExternalUser {
  455. ExternalUser (Value *S, llvm::User *U, int L) :
  456. Scalar(S), User(U), Lane(L){}
  457. // Which scalar in our function.
  458. Value *Scalar;
  459. // The user that uses the scalar.
  460. llvm::User *User;
  461. // Which lane does the scalar belong to.
  462. int Lane;
  463. };
  464. typedef SmallVector<ExternalUser, 16> UserList;
  465. /// Checks if two instructions may access the same memory.
  466. ///
  467. /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  468. /// is invariant in the calling loop.
  469. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
  470. Instruction *Inst2) {
  471. // First check if the result is already in the cache.
  472. AliasCacheKey key = std::make_pair(Inst1, Inst2);
  473. Optional<bool> &result = AliasCache[key];
  474. if (result.hasValue()) {
  475. return result.getValue();
  476. }
  477. MemoryLocation Loc2 = getLocation(Inst2, AA);
  478. bool aliased = true;
  479. if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
  480. // Do the alias check.
  481. aliased = AA->alias(Loc1, Loc2);
  482. }
  483. // Store the result in the cache.
  484. result = aliased;
  485. return aliased;
  486. }
  487. typedef std::pair<Instruction *, Instruction *> AliasCacheKey;
  488. /// Cache for alias results.
  489. /// TODO: consider moving this to the AliasAnalysis itself.
  490. DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
  491. /// Removes an instruction from its block and eventually deletes it.
  492. /// It's like Instruction::eraseFromParent() except that the actual deletion
  493. /// is delayed until BoUpSLP is destructed.
  494. /// This is required to ensure that there are no incorrect collisions in the
  495. /// AliasCache, which can happen if a new instruction is allocated at the
  496. /// same address as a previously deleted instruction.
  497. void eraseInstruction(Instruction *I) {
  498. I->removeFromParent();
  499. I->dropAllReferences();
  500. DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  501. }
  502. /// Temporary store for deleted instructions. Instructions will be deleted
  503. /// eventually when the BoUpSLP is destructed.
  504. SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
  505. /// A list of values that need to be extracted out of the tree.
  506. /// This list holds pairs of (Internal Scalar : External User).
  507. UserList ExternalUses;
  508. /// Values used only by @llvm.assume calls.
  509. SmallPtrSet<const Value *, 32> EphValues;
  510. /// Holds all of the instructions that we gathered.
  511. SetVector<Instruction *> GatherSeq;
  512. /// A list of blocks that we are going to CSE.
  513. SetVector<BasicBlock *> CSEBlocks;
  514. /// Contains all scheduling relevant data for an instruction.
  515. /// A ScheduleData either represents a single instruction or a member of an
  516. /// instruction bundle (= a group of instructions which is combined into a
  517. /// vector instruction).
  518. struct ScheduleData {
  519. // The initial value for the dependency counters. It means that the
  520. // dependencies are not calculated yet.
  521. enum { InvalidDeps = -1 };
  522. ScheduleData()
  523. : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
  524. NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
  525. Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
  526. UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}
  527. void init(int BlockSchedulingRegionID) {
  528. FirstInBundle = this;
  529. NextInBundle = nullptr;
  530. NextLoadStore = nullptr;
  531. IsScheduled = false;
  532. SchedulingRegionID = BlockSchedulingRegionID;
  533. UnscheduledDepsInBundle = UnscheduledDeps;
  534. clearDependencies();
  535. }
  536. /// Returns true if the dependency information has been calculated.
  537. bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  538. /// Returns true for single instructions and for bundle representatives
  539. /// (= the head of a bundle).
  540. bool isSchedulingEntity() const { return FirstInBundle == this; }
  541. /// Returns true if it represents an instruction bundle and not only a
  542. /// single instruction.
  543. bool isPartOfBundle() const {
  544. return NextInBundle != nullptr || FirstInBundle != this;
  545. }
  546. /// Returns true if it is ready for scheduling, i.e. it has no more
  547. /// unscheduled dependent instructions/bundles.
  548. bool isReady() const {
  549. assert(isSchedulingEntity() &&
  550. "can't consider non-scheduling entity for ready list");
  551. return UnscheduledDepsInBundle == 0 && !IsScheduled;
  552. }
  553. /// Modifies the number of unscheduled dependencies, also updating it for
  554. /// the whole bundle.
  555. int incrementUnscheduledDeps(int Incr) {
  556. UnscheduledDeps += Incr;
  557. return FirstInBundle->UnscheduledDepsInBundle += Incr;
  558. }
  559. /// Sets the number of unscheduled dependencies to the number of
  560. /// dependencies.
  561. void resetUnscheduledDeps() {
  562. incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
  563. }
  564. /// Clears all dependency information.
  565. void clearDependencies() {
  566. Dependencies = InvalidDeps;
  567. resetUnscheduledDeps();
  568. MemoryDependencies.clear();
  569. }
  570. void dump(raw_ostream &os) const {
  571. if (!isSchedulingEntity()) {
  572. os << "/ " << *Inst;
  573. } else if (NextInBundle) {
  574. os << '[' << *Inst;
  575. ScheduleData *SD = NextInBundle;
  576. while (SD) {
  577. os << ';' << *SD->Inst;
  578. SD = SD->NextInBundle;
  579. }
  580. os << ']';
  581. } else {
  582. os << *Inst;
  583. }
  584. }
  585. Instruction *Inst;
  586. /// Points to the head in an instruction bundle (and always to this for
  587. /// single instructions).
  588. ScheduleData *FirstInBundle;
  589. /// Singly linked list of all instructions in a bundle. Null if it is a
  590. /// single instruction.
  591. ScheduleData *NextInBundle;
  592. /// Singly linked list of all memory instructions (e.g. load, store, call)
  593. /// in the block - until the end of the scheduling region.
  594. ScheduleData *NextLoadStore;
  595. /// The dependent memory instructions.
  596. /// This list is derived on demand in calculateDependencies().
  597. SmallVector<ScheduleData *, 4> MemoryDependencies;
  598. /// This ScheduleData is in the current scheduling region if this matches
  599. /// the current SchedulingRegionID of BlockScheduling.
  600. int SchedulingRegionID;
  601. /// Used for getting a "good" final ordering of instructions.
  602. int SchedulingPriority;
  603. /// The number of dependencies. This is the number of users of the
  604. /// instruction plus the number of dependent memory instructions (if any).
  605. /// This value is calculated on demand.
  606. /// If InvalidDeps, the number of dependencies is not calculated yet.
  607. ///
  608. int Dependencies;
  609. /// The number of dependencies minus the number of dependencies of scheduled
  610. /// instructions. As soon as this is zero, the instruction/bundle gets ready
  611. /// for scheduling.
  612. /// Note that this is negative as long as Dependencies is not calculated.
  613. int UnscheduledDeps;
  614. /// The sum of UnscheduledDeps in a bundle. Equal to UnscheduledDeps for
  615. /// single instructions.
  616. int UnscheduledDepsInBundle;
  617. /// True if this instruction is scheduled (or considered as scheduled in the
  618. /// dry-run).
  619. bool IsScheduled;
  620. };
  621. #ifndef NDEBUG
  622. friend raw_ostream &operator<<(raw_ostream &os,
  623. const BoUpSLP::ScheduleData &SD);
  624. #endif
  625. /// Contains all scheduling data for a basic block.
  626. ///
  627. struct BlockScheduling {
  628. BlockScheduling(BasicBlock *BB)
  629. : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
  630. ScheduleStart(nullptr), ScheduleEnd(nullptr),
  631. FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
  632. ScheduleRegionSize(0),
  633. ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
  634. // Make sure that the initial SchedulingRegionID is greater than the
  635. // initial SchedulingRegionID in ScheduleData (which is 0).
  636. SchedulingRegionID(1) {}
  637. void clear() {
  638. ReadyInsts.clear();
  639. ScheduleStart = nullptr;
  640. ScheduleEnd = nullptr;
  641. FirstLoadStoreInRegion = nullptr;
  642. LastLoadStoreInRegion = nullptr;
  643. // Reduce the maximum schedule region size by the size of the
  644. // previous scheduling run.
  645. ScheduleRegionSizeLimit -= ScheduleRegionSize;
  646. if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
  647. ScheduleRegionSizeLimit = MinScheduleRegionSize;
  648. ScheduleRegionSize = 0;
  649. // Make a new scheduling region, i.e. all existing ScheduleData is not
  650. // in the new region yet.
  651. ++SchedulingRegionID;
  652. }
  653. ScheduleData *getScheduleData(Value *V) {
  654. ScheduleData *SD = ScheduleDataMap[V];
  655. if (SD && SD->SchedulingRegionID == SchedulingRegionID)
  656. return SD;
  657. return nullptr;
  658. }
  659. bool isInSchedulingRegion(ScheduleData *SD) {
  660. return SD->SchedulingRegionID == SchedulingRegionID;
  661. }
  662. /// Marks an instruction as scheduled and puts all dependent ready
  663. /// instructions into the ready-list.
  664. template <typename ReadyListType>
  665. void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
  666. SD->IsScheduled = true;
  667. DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
  668. ScheduleData *BundleMember = SD;
  669. while (BundleMember) {
  670. // Handle the def-use chain dependencies.
  671. for (Use &U : BundleMember->Inst->operands()) {
  672. ScheduleData *OpDef = getScheduleData(U.get());
  673. if (OpDef && OpDef->hasValidDependencies() &&
  674. OpDef->incrementUnscheduledDeps(-1) == 0) {
  675. // There are no more unscheduled dependencies after decrementing,
  676. // so we can put the dependent instruction into the ready list.
  677. ScheduleData *DepBundle = OpDef->FirstInBundle;
  678. assert(!DepBundle->IsScheduled &&
  679. "already scheduled bundle gets ready");
  680. ReadyList.insert(DepBundle);
  681. DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
  682. }
  683. }
  684. // Handle the memory dependencies.
  685. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
  686. if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
  687. // There are no more unscheduled dependencies after decrementing,
  688. // so we can put the dependent instruction into the ready list.
  689. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
  690. assert(!DepBundle->IsScheduled &&
  691. "already scheduled bundle gets ready");
  692. ReadyList.insert(DepBundle);
  693. DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
  694. }
  695. }
  696. BundleMember = BundleMember->NextInBundle;
  697. }
  698. }
  699. /// Put all instructions into the ReadyList which are ready for scheduling.
  700. template <typename ReadyListType>
  701. void initialFillReadyList(ReadyListType &ReadyList) {
  702. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  703. ScheduleData *SD = getScheduleData(I);
  704. if (SD->isSchedulingEntity() && SD->isReady()) {
  705. ReadyList.insert(SD);
  706. DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
  707. }
  708. }
  709. }
  710. /// Checks if a bundle of instructions can be scheduled, i.e. has no
  711. /// cyclic dependencies. This is only a dry-run, no instructions are
  712. /// actually moved at this stage.
  713. bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);
  714. /// Un-bundles a group of instructions.
  715. void cancelScheduling(ArrayRef<Value *> VL);
  716. /// Extends the scheduling region so that V is inside the region.
  717. /// \returns true if the region size is within the limit.
  718. bool extendSchedulingRegion(Value *V);
  719. /// Initialize the ScheduleData structures for new instructions in the
  720. /// scheduling region.
  721. void initScheduleData(Instruction *FromI, Instruction *ToI,
  722. ScheduleData *PrevLoadStore,
  723. ScheduleData *NextLoadStore);
  724. /// Updates the dependency information of a bundle and of all instructions/
  725. /// bundles which depend on the original bundle.
  726. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
  727. BoUpSLP *SLP);
  729. /// Sets all instructions in the scheduling region to un-scheduled.
  729. void resetSchedule();
  730. BasicBlock *BB;
  731. /// Simple memory allocation for ScheduleData.
  732. std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
  733. /// The size of a ScheduleData array in ScheduleDataChunks.
  734. int ChunkSize;
  735. /// The allocator position in the current chunk, which is the last entry
  736. /// of ScheduleDataChunks.
  737. int ChunkPos;
  738. /// Attaches ScheduleData to Instruction.
  739. /// Note that the mapping survives during all vectorization iterations, i.e.
  740. /// ScheduleData structures are recycled.
  741. DenseMap<Value *, ScheduleData *> ScheduleDataMap;
  742. struct ReadyList : SmallVector<ScheduleData *, 8> {
  743. void insert(ScheduleData *SD) { push_back(SD); }
  744. };
  745. /// The ready-list for scheduling (only used for the dry-run).
  746. ReadyList ReadyInsts;
  747. /// The first instruction of the scheduling region.
  748. Instruction *ScheduleStart;
  749. /// The first instruction _after_ the scheduling region.
  750. Instruction *ScheduleEnd;
  751. /// The first memory accessing instruction in the scheduling region
  752. /// (can be null).
  753. ScheduleData *FirstLoadStoreInRegion;
  754. /// The last memory accessing instruction in the scheduling region
  755. /// (can be null).
  756. ScheduleData *LastLoadStoreInRegion;
  757. /// The current size of the scheduling region.
  758. int ScheduleRegionSize;
  759. /// The maximum size allowed for the scheduling region.
  760. int ScheduleRegionSizeLimit;
  761. /// The ID of the scheduling region. For a new vectorization iteration this
  762. /// is incremented which "removes" all ScheduleData from the region.
  763. int SchedulingRegionID;
  764. };
  765. /// Attaches the BlockScheduling structures to basic blocks.
  766. MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
  767. /// Performs the "real" scheduling. Done before vectorization is actually
  768. /// performed in a basic block.
  769. void scheduleBlock(BlockScheduling *BS);
  770. /// List of users to ignore during scheduling and that don't need extracting.
  771. ArrayRef<Value *> UserIgnoreList;
  772. // Number of load-bundles, which contain consecutive loads.
  773. int NumLoadsWantToKeepOrder;
  774. // Number of load-bundles of size 2, which are consecutive loads if reversed.
  775. int NumLoadsWantToChangeOrder;
  776. // Analysis and block reference.
  777. Function *F;
  778. ScalarEvolution *SE;
  779. TargetTransformInfo *TTI;
  780. TargetLibraryInfo *TLI;
  781. AliasAnalysis *AA;
  782. LoopInfo *LI;
  783. DominatorTree *DT;
  784. AssumptionCache *AC;
  785. DemandedBits *DB;
  786. const DataLayout *DL;
  787. /// Instruction builder to construct the vectorized tree.
  788. IRBuilder<> Builder;
  789. /// A map of scalar integer values to the smallest bit width with which they
  790. /// can legally be represented.
  791. MapVector<Value *, uint64_t> MinBWs;
  792. };
  793. #ifndef NDEBUG
  794. raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  795. SD.dump(os);
  796. return os;
  797. }
  798. #endif
  799. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  800. ArrayRef<Value *> UserIgnoreLst) {
  801. deleteTree();
  802. UserIgnoreList = UserIgnoreLst;
  803. if (!getSameType(Roots))
  804. return;
  805. buildTree_rec(Roots, 0);
  806. // Collect the values that we need to extract from the tree.
  807. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  808. TreeEntry *Entry = &VectorizableTree[EIdx];
  809. // For each lane:
  810. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  811. Value *Scalar = Entry->Scalars[Lane];
  812. // No need to handle users of gathered values.
  813. if (Entry->NeedToGather)
  814. continue;
  815. for (User *U : Scalar->users()) {
  816. DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
  817. Instruction *UserInst = dyn_cast<Instruction>(U);
  818. if (!UserInst)
  819. continue;
  820. // Skip in-tree scalars that become vectors
  821. if (ScalarToTreeEntry.count(U)) {
  822. int Idx = ScalarToTreeEntry[U];
  823. TreeEntry *UseEntry = &VectorizableTree[Idx];
  824. Value *UseScalar = UseEntry->Scalars[0];
  825. // Some in-tree scalars will remain as scalar in vectorized
  826. // instructions. If that is the case, the one in Lane 0 will
  827. // be used.
  828. if (UseScalar != U ||
  829. !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
  830. DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
  831. << ".\n");
  832. assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
  833. continue;
  834. }
  835. }
  836. // Ignore users in the user ignore list.
  837. if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
  838. UserIgnoreList.end())
  839. continue;
  840. DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
  841. Lane << " from " << *Scalar << ".\n");
  842. ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
  843. }
  844. }
  845. }
  846. }
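// Illustrative sketch (not part of the original source): if a scalar in the
// tree, say %sum in lane 2 of an add bundle, is also consumed by an
// instruction outside the tree (e.g. a return or a non-vectorized store),
// that use is recorded above as ExternalUser(%sum, user, 2) so that
// vectorizeTree() can later materialize it with an extractelement from lane 2.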
  847. void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  848. bool SameTy = getSameType(VL); (void)SameTy;
  849. bool isAltShuffle = false;
  850. assert(SameTy && "Invalid types!");
  851. if (Depth == RecursionMaxDepth) {
  852. DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
  853. newTreeEntry(VL, false);
  854. return;
  855. }
  856. // Don't handle vectors.
  857. if (VL[0]->getType()->isVectorTy()) {
  858. DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
  859. newTreeEntry(VL, false);
  860. return;
  861. }
  862. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  863. if (SI->getValueOperand()->getType()->isVectorTy()) {
  864. DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
  865. newTreeEntry(VL, false);
  866. return;
  867. }
  868. unsigned Opcode = getSameOpcode(VL);
  869. // Check that this shuffle vector refers to the alternate
  870. // sequence of opcodes.
  871. if (Opcode == Instruction::ShuffleVector) {
  872. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  873. unsigned Op = I0->getOpcode();
  874. if (Op != Instruction::ShuffleVector)
  875. isAltShuffle = true;
  876. }
  877. // If all of the operands are identical or constant we have a simple solution.
  878. if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
  879. DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
  880. newTreeEntry(VL, false);
  881. return;
  882. }
  883. // We now know that this is a vector of instructions of the same type from
  884. // the same block.
  885. // Don't vectorize ephemeral values.
  886. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  887. if (EphValues.count(VL[i])) {
  888. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  889. ") is ephemeral.\n");
  890. newTreeEntry(VL, false);
  891. return;
  892. }
  893. }
  894. // Check if this is a duplicate of another entry.
  895. if (ScalarToTreeEntry.count(VL[0])) {
  896. int Idx = ScalarToTreeEntry[VL[0]];
  897. TreeEntry *E = &VectorizableTree[Idx];
  898. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  899. DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
  900. if (E->Scalars[i] != VL[i]) {
  901. DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
  902. newTreeEntry(VL, false);
  903. return;
  904. }
  905. }
  906. DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
  907. return;
  908. }
  909. // Check that none of the instructions in the bundle are already in the tree.
  910. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  911. if (ScalarToTreeEntry.count(VL[i])) {
  912. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  913. ") is already in tree.\n");
  914. newTreeEntry(VL, false);
  915. return;
  916. }
  917. }
  918. // If any of the scalars is marked as a value that needs to stay scalar then
  919. // we need to gather the scalars.
  920. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  921. if (MustGather.count(VL[i])) {
  922. DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
  923. newTreeEntry(VL, false);
  924. return;
  925. }
  926. }
  927. // Check that all of the users of the scalars that we want to vectorize are
  928. // schedulable.
  929. Instruction *VL0 = cast<Instruction>(VL[0]);
930. BasicBlock *BB = VL0->getParent();
  931. if (!DT->isReachableFromEntry(BB)) {
  932. // Don't go into unreachable blocks. They may contain instructions with
  933. // dependency cycles which confuse the final scheduling.
  934. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
  935. newTreeEntry(VL, false);
  936. return;
  937. }
938. // Check that every instruction appears only once in this bundle.
  939. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  940. for (unsigned j = i+1; j < e; ++j)
  941. if (VL[i] == VL[j]) {
  942. DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
  943. newTreeEntry(VL, false);
  944. return;
  945. }
  946. auto &BSRef = BlocksSchedules[BB];
  947. if (!BSRef) {
  948. BSRef = llvm::make_unique<BlockScheduling>(BB);
  949. }
  950. BlockScheduling &BS = *BSRef.get();
  951. if (!BS.tryScheduleBundle(VL, this)) {
  952. DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
  953. assert((!BS.getScheduleData(VL[0]) ||
  954. !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
  955. "tryScheduleBundle should cancelScheduling on failure");
  956. newTreeEntry(VL, false);
  957. return;
  958. }
  959. DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  960. switch (Opcode) {
  961. case Instruction::PHI: {
  962. PHINode *PH = dyn_cast<PHINode>(VL0);
  963. // Check for terminator values (e.g. invoke).
  964. for (unsigned j = 0; j < VL.size(); ++j)
  965. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  966. TerminatorInst *Term = dyn_cast<TerminatorInst>(
  967. cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
  968. if (Term) {
  969. DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
  970. BS.cancelScheduling(VL);
  971. newTreeEntry(VL, false);
  972. return;
  973. }
  974. }
  975. newTreeEntry(VL, true);
  976. DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
  977. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  978. ValueList Operands;
  979. // Prepare the operand vector.
  980. for (unsigned j = 0; j < VL.size(); ++j)
  981. Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
  982. PH->getIncomingBlock(i)));
  983. buildTree_rec(Operands, Depth + 1);
  984. }
  985. return;
  986. }
  987. case Instruction::ExtractElement: {
  988. bool Reuse = CanReuseExtract(VL);
  989. if (Reuse) {
  990. DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
  991. } else {
  992. BS.cancelScheduling(VL);
  993. }
  994. newTreeEntry(VL, Reuse);
  995. return;
  996. }
  997. case Instruction::Load: {
  998. // Check that a vectorized load would load the same memory as a scalar
  999. // load.
1000. // For example, we don't want to vectorize loads that are smaller than 8 bits.
1001. // Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM treats
1002. // loading/storing it as an i8 struct. If we vectorize loads/stores from
1003. // such a struct, we read/write packed bits, disagreeing with the
1004. // unvectorized version.
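// E.g. for i2 the DataLayout type size is 2 bits while the alloc size is
// 8 bits (one byte), so the check below gathers such bundles instead.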
  1005. Type *ScalarTy = VL[0]->getType();
  1006. if (DL->getTypeSizeInBits(ScalarTy) !=
  1007. DL->getTypeAllocSizeInBits(ScalarTy)) {
  1008. BS.cancelScheduling(VL);
  1009. newTreeEntry(VL, false);
  1010. DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
  1011. return;
  1012. }
1013. // Check if the loads are consecutive or if we need to swizzle them.
  1014. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  1015. LoadInst *L = cast<LoadInst>(VL[i]);
  1016. if (!L->isSimple()) {
  1017. BS.cancelScheduling(VL);
  1018. newTreeEntry(VL, false);
  1019. DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
  1020. return;
  1021. }
  1022. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1023. if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
  1024. ++NumLoadsWantToChangeOrder;
  1025. }
  1026. BS.cancelScheduling(VL);
  1027. newTreeEntry(VL, false);
  1028. DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
  1029. return;
  1030. }
  1031. }
  1032. ++NumLoadsWantToKeepOrder;
  1033. newTreeEntry(VL, true);
  1034. DEBUG(dbgs() << "SLP: added a vector of loads.\n");
  1035. return;
  1036. }
  1037. case Instruction::ZExt:
  1038. case Instruction::SExt:
  1039. case Instruction::FPToUI:
  1040. case Instruction::FPToSI:
  1041. case Instruction::FPExt:
  1042. case Instruction::PtrToInt:
  1043. case Instruction::IntToPtr:
  1044. case Instruction::SIToFP:
  1045. case Instruction::UIToFP:
  1046. case Instruction::Trunc:
  1047. case Instruction::FPTrunc:
  1048. case Instruction::BitCast: {
  1049. Type *SrcTy = VL0->getOperand(0)->getType();
  1050. for (unsigned i = 0; i < VL.size(); ++i) {
  1051. Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
  1052. if (Ty != SrcTy || !isValidElementType(Ty)) {
  1053. BS.cancelScheduling(VL);
  1054. newTreeEntry(VL, false);
  1055. DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
  1056. return;
  1057. }
  1058. }
  1059. newTreeEntry(VL, true);
  1060. DEBUG(dbgs() << "SLP: added a vector of casts.\n");
  1061. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1062. ValueList Operands;
  1063. // Prepare the operand vector.
  1064. for (unsigned j = 0; j < VL.size(); ++j)
  1065. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1066. buildTree_rec(Operands, Depth+1);
  1067. }
  1068. return;
  1069. }
  1070. case Instruction::ICmp:
  1071. case Instruction::FCmp: {
  1072. // Check that all of the compares have the same predicate.
  1073. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1074. Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
  1075. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  1076. CmpInst *Cmp = cast<CmpInst>(VL[i]);
  1077. if (Cmp->getPredicate() != P0 ||
  1078. Cmp->getOperand(0)->getType() != ComparedTy) {
  1079. BS.cancelScheduling(VL);
  1080. newTreeEntry(VL, false);
  1081. DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
  1082. return;
  1083. }
  1084. }
  1085. newTreeEntry(VL, true);
  1086. DEBUG(dbgs() << "SLP: added a vector of compares.\n");
  1087. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1088. ValueList Operands;
  1089. // Prepare the operand vector.
  1090. for (unsigned j = 0; j < VL.size(); ++j)
  1091. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1092. buildTree_rec(Operands, Depth+1);
  1093. }
  1094. return;
  1095. }
  1096. case Instruction::Select:
  1097. case Instruction::Add:
  1098. case Instruction::FAdd:
  1099. case Instruction::Sub:
  1100. case Instruction::FSub:
  1101. case Instruction::Mul:
  1102. case Instruction::FMul:
  1103. case Instruction::UDiv:
  1104. case Instruction::SDiv:
  1105. case Instruction::FDiv:
  1106. case Instruction::URem:
  1107. case Instruction::SRem:
  1108. case Instruction::FRem:
  1109. case Instruction::Shl:
  1110. case Instruction::LShr:
  1111. case Instruction::AShr:
  1112. case Instruction::And:
  1113. case Instruction::Or:
  1114. case Instruction::Xor: {
  1115. newTreeEntry(VL, true);
  1116. DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
  1117. // Sort operands of the instructions so that each side is more likely to
  1118. // have the same opcode.
  1119. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
  1120. ValueList Left, Right;
  1121. reorderInputsAccordingToOpcode(VL, Left, Right);
  1122. buildTree_rec(Left, Depth + 1);
  1123. buildTree_rec(Right, Depth + 1);
  1124. return;
  1125. }
  1126. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1127. ValueList Operands;
  1128. // Prepare the operand vector.
  1129. for (unsigned j = 0; j < VL.size(); ++j)
  1130. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1131. buildTree_rec(Operands, Depth+1);
  1132. }
  1133. return;
  1134. }
  1135. case Instruction::GetElementPtr: {
  1136. // We don't combine GEPs with complicated (nested) indexing.
  1137. for (unsigned j = 0; j < VL.size(); ++j) {
  1138. if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
  1139. DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
  1140. BS.cancelScheduling(VL);
  1141. newTreeEntry(VL, false);
  1142. return;
  1143. }
  1144. }
  1145. // We can't combine several GEPs into one vector if they operate on
  1146. // different types.
  1147. Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
  1148. for (unsigned j = 0; j < VL.size(); ++j) {
  1149. Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
  1150. if (Ty0 != CurTy) {
  1151. DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
  1152. BS.cancelScheduling(VL);
  1153. newTreeEntry(VL, false);
  1154. return;
  1155. }
  1156. }
  1157. // We don't combine GEPs with non-constant indexes.
  1158. for (unsigned j = 0; j < VL.size(); ++j) {
  1159. auto Op = cast<Instruction>(VL[j])->getOperand(1);
  1160. if (!isa<ConstantInt>(Op)) {
  1161. DEBUG(
  1162. dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
  1163. BS.cancelScheduling(VL);
  1164. newTreeEntry(VL, false);
  1165. return;
  1166. }
  1167. }
  1168. newTreeEntry(VL, true);
  1169. DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
  1170. for (unsigned i = 0, e = 2; i < e; ++i) {
  1171. ValueList Operands;
  1172. // Prepare the operand vector.
  1173. for (unsigned j = 0; j < VL.size(); ++j)
  1174. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1175. buildTree_rec(Operands, Depth + 1);
  1176. }
  1177. return;
  1178. }
  1179. case Instruction::Store: {
1180. // Check if the stores are consecutive or if we need to swizzle them.
  1181. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
  1182. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1183. BS.cancelScheduling(VL);
  1184. newTreeEntry(VL, false);
  1185. DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
  1186. return;
  1187. }
  1188. newTreeEntry(VL, true);
  1189. DEBUG(dbgs() << "SLP: added a vector of stores.\n");
  1190. ValueList Operands;
  1191. for (unsigned j = 0; j < VL.size(); ++j)
  1192. Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
  1193. buildTree_rec(Operands, Depth + 1);
  1194. return;
  1195. }
  1196. case Instruction::Call: {
  1197. // Check if the calls are all to the same vectorizable intrinsic.
  1198. CallInst *CI = cast<CallInst>(VL[0]);
1199. // Check if this is an intrinsic call or something that can be
1200. // represented by an intrinsic call.
  1201. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1202. if (!isTriviallyVectorizable(ID)) {
  1203. BS.cancelScheduling(VL);
  1204. newTreeEntry(VL, false);
  1205. DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
  1206. return;
  1207. }
  1208. Function *Int = CI->getCalledFunction();
  1209. Value *A1I = nullptr;
  1210. if (hasVectorInstrinsicScalarOpd(ID, 1))
  1211. A1I = CI->getArgOperand(1);
  1212. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1213. CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
  1214. if (!CI2 || CI2->getCalledFunction() != Int ||
  1215. getVectorIntrinsicIDForCall(CI2, TLI) != ID) {
  1216. BS.cancelScheduling(VL);
  1217. newTreeEntry(VL, false);
  1218. DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
  1219. << "\n");
  1220. return;
  1221. }
1222. // ctlz, cttz and powi are special intrinsics whose second argument
1223. // must be the same for them to be vectorized.
  1224. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  1225. Value *A1J = CI2->getArgOperand(1);
  1226. if (A1I != A1J) {
  1227. BS.cancelScheduling(VL);
  1228. newTreeEntry(VL, false);
  1229. DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
  1230. << " argument "<< A1I<<"!=" << A1J
  1231. << "\n");
  1232. return;
  1233. }
  1234. }
  1235. }
  1236. newTreeEntry(VL, true);
  1237. for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
  1238. ValueList Operands;
  1239. // Prepare the operand vector.
  1240. for (unsigned j = 0; j < VL.size(); ++j) {
  1241. CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
  1242. Operands.push_back(CI2->getArgOperand(i));
  1243. }
  1244. buildTree_rec(Operands, Depth + 1);
  1245. }
  1246. return;
  1247. }
  1248. case Instruction::ShuffleVector: {
1249. // If this is not an alternate sequence of opcodes like add-sub
1250. // then do not vectorize this instruction.
  1251. if (!isAltShuffle) {
  1252. BS.cancelScheduling(VL);
  1253. newTreeEntry(VL, false);
  1254. DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
  1255. return;
  1256. }
  1257. newTreeEntry(VL, true);
  1258. DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
  1259. // Reorder operands if reordering would enable vectorization.
  1260. if (isa<BinaryOperator>(VL0)) {
  1261. ValueList Left, Right;
  1262. reorderAltShuffleOperands(VL, Left, Right);
  1263. buildTree_rec(Left, Depth + 1);
  1264. buildTree_rec(Right, Depth + 1);
  1265. return;
  1266. }
  1267. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1268. ValueList Operands;
  1269. // Prepare the operand vector.
  1270. for (unsigned j = 0; j < VL.size(); ++j)
  1271. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  1272. buildTree_rec(Operands, Depth + 1);
  1273. }
  1274. return;
  1275. }
  1276. default:
  1277. BS.cancelScheduling(VL);
  1278. newTreeEntry(VL, false);
  1279. DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  1280. return;
  1281. }
  1282. }
  1283. int BoUpSLP::getEntryCost(TreeEntry *E) {
  1284. ArrayRef<Value*> VL = E->Scalars;
  1285. Type *ScalarTy = VL[0]->getType();
  1286. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1287. ScalarTy = SI->getValueOperand()->getType();
  1288. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1289. // If we have computed a smaller type for the expression, update VecTy so
  1290. // that the costs will be accurate.
  1291. if (MinBWs.count(VL[0]))
  1292. VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
  1293. VL.size());
  1294. if (E->NeedToGather) {
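// An all-constant bundle folds into a single constant vector, so gathering
// it is free.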
  1295. if (allConstant(VL))
  1296. return 0;
  1297. if (isSplat(VL)) {
  1298. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  1299. }
  1300. return getGatherCost(E->Scalars);
  1301. }
  1302. unsigned Opcode = getSameOpcode(VL);
  1303. assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
  1304. Instruction *VL0 = cast<Instruction>(VL[0]);
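// The cases below generally return (cost of the vector form) minus (total
// cost of the scalar forms), so a negative result means vectorizing this
// bundle is expected to be profitable.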
  1305. switch (Opcode) {
  1306. case Instruction::PHI: {
  1307. return 0;
  1308. }
  1309. case Instruction::ExtractElement: {
  1310. if (CanReuseExtract(VL)) {
  1311. int DeadCost = 0;
  1312. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1313. ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
  1314. if (E->hasOneUse())
  1315. // Take credit for instruction that will become dead.
  1316. DeadCost +=
  1317. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
  1318. }
  1319. return -DeadCost;
  1320. }
  1321. return getGatherCost(VecTy);
  1322. }
  1323. case Instruction::ZExt:
  1324. case Instruction::SExt:
  1325. case Instruction::FPToUI:
  1326. case Instruction::FPToSI:
  1327. case Instruction::FPExt:
  1328. case Instruction::PtrToInt:
  1329. case Instruction::IntToPtr:
  1330. case Instruction::SIToFP:
  1331. case Instruction::UIToFP:
  1332. case Instruction::Trunc:
  1333. case Instruction::FPTrunc:
  1334. case Instruction::BitCast: {
  1335. Type *SrcTy = VL0->getOperand(0)->getType();
  1336. // Calculate the cost of this instruction.
  1337. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
  1338. VL0->getType(), SrcTy);
  1339. VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
  1340. int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
  1341. return VecCost - ScalarCost;
  1342. }
  1343. case Instruction::FCmp:
  1344. case Instruction::ICmp:
  1345. case Instruction::Select:
  1346. case Instruction::Add:
  1347. case Instruction::FAdd:
  1348. case Instruction::Sub:
  1349. case Instruction::FSub:
  1350. case Instruction::Mul:
  1351. case Instruction::FMul:
  1352. case Instruction::UDiv:
  1353. case Instruction::SDiv:
  1354. case Instruction::FDiv:
  1355. case Instruction::URem:
  1356. case Instruction::SRem:
  1357. case Instruction::FRem:
  1358. case Instruction::Shl:
  1359. case Instruction::LShr:
  1360. case Instruction::AShr:
  1361. case Instruction::And:
  1362. case Instruction::Or:
  1363. case Instruction::Xor: {
  1364. // Calculate the cost of this instruction.
  1365. int ScalarCost = 0;
  1366. int VecCost = 0;
  1367. if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
  1368. Opcode == Instruction::Select) {
  1369. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
  1370. ScalarCost = VecTy->getNumElements() *
  1371. TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
  1372. VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
  1373. } else {
  1374. // Certain instructions can be cheaper to vectorize if they have a
  1375. // constant second vector operand.
  1376. TargetTransformInfo::OperandValueKind Op1VK =
  1377. TargetTransformInfo::OK_AnyValue;
  1378. TargetTransformInfo::OperandValueKind Op2VK =
  1379. TargetTransformInfo::OK_UniformConstantValue;
  1380. TargetTransformInfo::OperandValueProperties Op1VP =
  1381. TargetTransformInfo::OP_None;
  1382. TargetTransformInfo::OperandValueProperties Op2VP =
  1383. TargetTransformInfo::OP_None;
  1384. // If all operands are exactly the same ConstantInt then set the
  1385. // operand kind to OK_UniformConstantValue.
  1386. // If instead not all operands are constants, then set the operand kind
  1387. // to OK_AnyValue. If all operands are constants but not the same,
  1388. // then set the operand kind to OK_NonUniformConstantValue.
  1389. ConstantInt *CInt = nullptr;
  1390. for (unsigned i = 0; i < VL.size(); ++i) {
  1391. const Instruction *I = cast<Instruction>(VL[i]);
  1392. if (!isa<ConstantInt>(I->getOperand(1))) {
  1393. Op2VK = TargetTransformInfo::OK_AnyValue;
  1394. break;
  1395. }
  1396. if (i == 0) {
  1397. CInt = cast<ConstantInt>(I->getOperand(1));
  1398. continue;
  1399. }
  1400. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
  1401. CInt != cast<ConstantInt>(I->getOperand(1)))
  1402. Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
  1403. }
1404. // FIXME: Currently the cost-model modification for division by a power of
1405. // 2 is only handled for X86 and AArch64. Add support for other targets.
  1406. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
  1407. CInt->getValue().isPowerOf2())
  1408. Op2VP = TargetTransformInfo::OP_PowerOf2;
  1409. ScalarCost = VecTy->getNumElements() *
  1410. TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
  1411. Op1VP, Op2VP);
  1412. VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
  1413. Op1VP, Op2VP);
  1414. }
  1415. return VecCost - ScalarCost;
  1416. }
  1417. case Instruction::GetElementPtr: {
  1418. TargetTransformInfo::OperandValueKind Op1VK =
  1419. TargetTransformInfo::OK_AnyValue;
  1420. TargetTransformInfo::OperandValueKind Op2VK =
  1421. TargetTransformInfo::OK_UniformConstantValue;
  1422. int ScalarCost =
  1423. VecTy->getNumElements() *
  1424. TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
  1425. int VecCost =
  1426. TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
  1427. return VecCost - ScalarCost;
  1428. }
  1429. case Instruction::Load: {
  1430. // Cost of wide load - cost of scalar loads.
  1431. int ScalarLdCost = VecTy->getNumElements() *
  1432. TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
  1433. int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
  1434. return VecLdCost - ScalarLdCost;
  1435. }
  1436. case Instruction::Store: {
  1437. // We know that we can merge the stores. Calculate the cost.
  1438. int ScalarStCost = VecTy->getNumElements() *
  1439. TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
  1440. int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
  1441. return VecStCost - ScalarStCost;
  1442. }
  1443. case Instruction::Call: {
  1444. CallInst *CI = cast<CallInst>(VL0);
  1445. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1446. // Calculate the cost of the scalar and vector calls.
  1447. SmallVector<Type*, 4> ScalarTys, VecTys;
  1448. for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
  1449. ScalarTys.push_back(CI->getArgOperand(op)->getType());
  1450. VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
  1451. VecTy->getNumElements()));
  1452. }
  1453. FastMathFlags FMF;
  1454. if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
  1455. FMF = FPMO->getFastMathFlags();
  1456. int ScalarCallCost = VecTy->getNumElements() *
  1457. TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
  1458. int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
  1459. DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
  1460. << " (" << VecCallCost << "-" << ScalarCallCost << ")"
  1461. << " for " << *CI << "\n");
  1462. return VecCallCost - ScalarCallCost;
  1463. }
  1464. case Instruction::ShuffleVector: {
  1465. TargetTransformInfo::OperandValueKind Op1VK =
  1466. TargetTransformInfo::OK_AnyValue;
  1467. TargetTransformInfo::OperandValueKind Op2VK =
  1468. TargetTransformInfo::OK_AnyValue;
  1469. int ScalarCost = 0;
  1470. int VecCost = 0;
  1471. for (unsigned i = 0; i < VL.size(); ++i) {
  1472. Instruction *I = cast<Instruction>(VL[i]);
  1473. if (!I)
  1474. break;
  1475. ScalarCost +=
  1476. TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
  1477. }
1478. // VecCost is equal to the sum of the cost of creating the two vectors
1479. // and the cost of creating the shuffle.
  1480. Instruction *I0 = cast<Instruction>(VL[0]);
  1481. VecCost =
  1482. TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
  1483. Instruction *I1 = cast<Instruction>(VL[1]);
  1484. VecCost +=
  1485. TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
  1486. VecCost +=
  1487. TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
  1488. return VecCost - ScalarCost;
  1489. }
  1490. default:
  1491. llvm_unreachable("Unknown instruction");
  1492. }
  1493. }
  1494. bool BoUpSLP::isFullyVectorizableTinyTree() {
  1495. DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
  1496. VectorizableTree.size() << " is fully vectorizable .\n");
  1497. // We only handle trees of height 2.
  1498. if (VectorizableTree.size() != 2)
  1499. return false;
  1500. // Handle splat and all-constants stores.
  1501. if (!VectorizableTree[0].NeedToGather &&
  1502. (allConstant(VectorizableTree[1].Scalars) ||
  1503. isSplat(VectorizableTree[1].Scalars)))
  1504. return true;
  1505. // Gathering cost would be too much for tiny trees.
  1506. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
  1507. return false;
  1508. return true;
  1509. }
  1510. int BoUpSLP::getSpillCost() {
  1511. // Walk from the bottom of the tree to the top, tracking which values are
  1512. // live. When we see a call instruction that is not part of our tree,
  1513. // query TTI to see if there is a cost to keeping values live over it
  1514. // (for example, if spills and fills are required).
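// For example, a call that is not part of the tree and sits between two
// bundles may force every tree value live across it to be spilled and
// reloaded; getCostOfKeepingLiveOverCall() below models that penalty.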
  1515. unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  1516. int Cost = 0;
  1517. SmallPtrSet<Instruction*, 4> LiveValues;
  1518. Instruction *PrevInst = nullptr;
  1519. for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
  1520. Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
  1521. if (!Inst)
  1522. continue;
  1523. if (!PrevInst) {
  1524. PrevInst = Inst;
  1525. continue;
  1526. }
  1527. // Update LiveValues.
  1528. LiveValues.erase(PrevInst);
  1529. for (auto &J : PrevInst->operands()) {
  1530. if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
  1531. LiveValues.insert(cast<Instruction>(&*J));
  1532. }
  1533. DEBUG(
  1534. dbgs() << "SLP: #LV: " << LiveValues.size();
  1535. for (auto *X : LiveValues)
  1536. dbgs() << " " << X->getName();
  1537. dbgs() << ", Looking at ";
  1538. Inst->dump();
  1539. );
  1540. // Now find the sequence of instructions between PrevInst and Inst.
  1541. BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
  1542. PrevInstIt(PrevInst->getIterator());
  1543. --PrevInstIt;
  1544. while (InstIt != PrevInstIt) {
  1545. if (PrevInstIt == PrevInst->getParent()->rend()) {
  1546. PrevInstIt = Inst->getParent()->rbegin();
  1547. continue;
  1548. }
  1549. if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
  1550. SmallVector<Type*, 4> V;
  1551. for (auto *II : LiveValues)
  1552. V.push_back(VectorType::get(II->getType(), BundleWidth));
  1553. Cost += TTI->getCostOfKeepingLiveOverCall(V);
  1554. }
  1555. ++PrevInstIt;
  1556. }
  1557. PrevInst = Inst;
  1558. }
  1559. return Cost;
  1560. }
  1561. int BoUpSLP::getTreeCost() {
  1562. int Cost = 0;
  1563. DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
  1564. VectorizableTree.size() << ".\n");
1565. // We only vectorize tiny trees if they are fully vectorizable.
  1566. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
  1567. if (VectorizableTree.empty()) {
  1568. assert(!ExternalUses.size() && "We should not have any external users");
  1569. }
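// INT_MAX signals to the caller that this tree must not be vectorized.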
  1570. return INT_MAX;
  1571. }
  1572. unsigned BundleWidth = VectorizableTree[0].Scalars.size();
  1573. for (TreeEntry &TE : VectorizableTree) {
  1574. int C = getEntryCost(&TE);
  1575. DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
  1576. << *TE.Scalars[0] << ".\n");
  1577. Cost += C;
  1578. }
  1579. SmallSet<Value *, 16> ExtractCostCalculated;
  1580. int ExtractCost = 0;
  1581. for (ExternalUser &EU : ExternalUses) {
  1582. // We only add extract cost once for the same scalar.
  1583. if (!ExtractCostCalculated.insert(EU.Scalar).second)
  1584. continue;
  1585. // Uses by ephemeral values are free (because the ephemeral value will be
  1586. // removed prior to code generation, and so the extraction will be
  1587. // removed as well).
  1588. if (EphValues.count(EU.User))
  1589. continue;
  1590. // If we plan to rewrite the tree in a smaller type, we will need to sign
  1591. // extend the extracted value back to the original type. Here, we account
  1592. // for the extract and the added cost of the sign extend if needed.
  1593. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
  1594. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  1595. if (MinBWs.count(ScalarRoot)) {
  1596. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
  1597. VecTy = VectorType::get(MinTy, BundleWidth);
  1598. ExtractCost +=
  1599. TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
  1600. }
  1601. ExtractCost +=
  1602. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
  1603. }
  1604. int SpillCost = getSpillCost();
  1605. Cost += SpillCost + ExtractCost;
  1606. DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
  1607. << "SLP: Extract Cost = " << ExtractCost << ".\n"
  1608. << "SLP: Total Cost = " << Cost << ".\n");
  1609. return Cost;
  1610. }
  1611. int BoUpSLP::getGatherCost(Type *Ty) {
  1612. int Cost = 0;
  1613. for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
  1614. Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  1615. return Cost;
  1616. }
  1617. int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  1618. // Find the type of the operands in VL.
  1619. Type *ScalarTy = VL[0]->getType();
  1620. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1621. ScalarTy = SI->getValueOperand()->getType();
  1622. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1623. // Find the cost of inserting/extracting values from the vector.
  1624. return getGatherCost(VecTy);
  1625. }
1626. // Reorder commutative operations in an alternate shuffle if the resulting
1627. // vectors are consecutive loads. This would allow us to vectorize the tree.
1628. // If we have something like:
1629. // load a[0] - load b[0]
1630. // load b[1] + load a[1]
1631. // load a[2] - load b[2]
1632. // load a[3] + load b[3]
1633. // reordering the operands of the second row (load b[1], load a[1]) would
1634. // allow us to vectorize this code.
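// Note: this helper is used both when building the tree and when emitting the
// alternate-shuffle node in vectorizeTree(), so both callers see the same
// operand order.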
  1635. void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
  1636. SmallVectorImpl<Value *> &Left,
  1637. SmallVectorImpl<Value *> &Right) {
  1638. // Push left and right operands of binary operation into Left and Right
  1639. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1640. Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
  1641. Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
  1642. }
1643. // Reorder if we have a commutative operation and consecutive accesses
1644. // are on either side of the alternate instructions.
  1645. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  1646. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  1647. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  1648. Instruction *VL1 = cast<Instruction>(VL[j]);
  1649. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1650. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  1651. std::swap(Left[j], Right[j]);
  1652. continue;
  1653. } else if (VL2->isCommutative() &&
  1654. isConsecutiveAccess(L, L1, *DL, *SE)) {
  1655. std::swap(Left[j + 1], Right[j + 1]);
  1656. continue;
  1657. }
  1658. // else unchanged
  1659. }
  1660. }
  1661. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  1662. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  1663. Instruction *VL1 = cast<Instruction>(VL[j]);
  1664. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1665. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  1666. std::swap(Left[j], Right[j]);
  1667. continue;
  1668. } else if (VL2->isCommutative() &&
  1669. isConsecutiveAccess(L, L1, *DL, *SE)) {
  1670. std::swap(Left[j + 1], Right[j + 1]);
  1671. continue;
  1672. }
  1673. // else unchanged
  1674. }
  1675. }
  1676. }
  1677. }
1678. // Return true if I should be commuted before adding its left and right
1679. // operands to the arrays Left and Right.
1680. //
1681. // The vectorizer is trying to either have all elements on one side be
1682. // instructions with the same opcode (to enable further vectorization), or to
1683. // have a splat (to lower the vectorization cost).
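// Illustrative example (hypothetical scalars): for the bundle {a+b, c+a, a+d},
// commuting the operands of the second element keeps 'a' in Left, so Left
// stays a splat, which is cheaper than producing two mixed operand vectors.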
  1684. static bool shouldReorderOperands(int i, Instruction &I,
  1685. SmallVectorImpl<Value *> &Left,
  1686. SmallVectorImpl<Value *> &Right,
  1687. bool AllSameOpcodeLeft,
  1688. bool AllSameOpcodeRight, bool SplatLeft,
  1689. bool SplatRight) {
  1690. Value *VLeft = I.getOperand(0);
  1691. Value *VRight = I.getOperand(1);
  1692. // If we have "SplatRight", try to see if commuting is needed to preserve it.
  1693. if (SplatRight) {
  1694. if (VRight == Right[i - 1])
  1695. // Preserve SplatRight
  1696. return false;
  1697. if (VLeft == Right[i - 1]) {
  1698. // Commuting would preserve SplatRight, but we don't want to break
  1699. // SplatLeft either, i.e. preserve the original order if possible.
  1700. // (FIXME: why do we care?)
  1701. if (SplatLeft && VLeft == Left[i - 1])
  1702. return false;
  1703. return true;
  1704. }
  1705. }
  1706. // Symmetrically handle Right side.
  1707. if (SplatLeft) {
  1708. if (VLeft == Left[i - 1])
  1709. // Preserve SplatLeft
  1710. return false;
  1711. if (VRight == Left[i - 1])
  1712. return true;
  1713. }
  1714. Instruction *ILeft = dyn_cast<Instruction>(VLeft);
  1715. Instruction *IRight = dyn_cast<Instruction>(VRight);
  1716. // If we have "AllSameOpcodeRight", try to see if the left operands preserves
  1717. // it and not the right, in this case we want to commute.
  1718. if (AllSameOpcodeRight) {
  1719. unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
  1720. if (IRight && RightPrevOpcode == IRight->getOpcode())
  1721. // Do not commute, a match on the right preserves AllSameOpcodeRight
  1722. return false;
  1723. if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
  1724. // We have a match and may want to commute, but first check if there is
  1725. // not also a match on the existing operands on the Left to preserve
  1726. // AllSameOpcodeLeft, i.e. preserve the original order if possible.
  1727. // (FIXME: why do we care?)
  1728. if (AllSameOpcodeLeft && ILeft &&
  1729. cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
  1730. return false;
  1731. return true;
  1732. }
  1733. }
  1734. // Symmetrically handle Left side.
  1735. if (AllSameOpcodeLeft) {
  1736. unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
  1737. if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
  1738. return false;
  1739. if (IRight && LeftPrevOpcode == IRight->getOpcode())
  1740. return true;
  1741. }
  1742. return false;
  1743. }
  1744. void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  1745. SmallVectorImpl<Value *> &Left,
  1746. SmallVectorImpl<Value *> &Right) {
  1747. if (VL.size()) {
  1748. // Peel the first iteration out of the loop since there's nothing
  1749. // interesting to do anyway and it simplifies the checks in the loop.
  1750. auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
  1751. auto VRight = cast<Instruction>(VL[0])->getOperand(1);
  1752. if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
  1753. // Favor having instruction to the right. FIXME: why?
  1754. std::swap(VLeft, VRight);
  1755. Left.push_back(VLeft);
  1756. Right.push_back(VRight);
  1757. }
1758. // Keep track of whether we have instructions with all the same opcode on one side.
  1759. bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
  1760. bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
1761. // Keep track of whether we have one side with all the same value (a broadcast).
  1762. bool SplatLeft = true;
  1763. bool SplatRight = true;
  1764. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1765. Instruction *I = cast<Instruction>(VL[i]);
  1766. assert(I->isCommutative() && "Can only process commutative instruction");
  1767. // Commute to favor either a splat or maximizing having the same opcodes on
  1768. // one side.
  1769. if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
  1770. AllSameOpcodeRight, SplatLeft, SplatRight)) {
  1771. Left.push_back(I->getOperand(1));
  1772. Right.push_back(I->getOperand(0));
  1773. } else {
  1774. Left.push_back(I->getOperand(0));
  1775. Right.push_back(I->getOperand(1));
  1776. }
  1777. // Update Splat* and AllSameOpcode* after the insertion.
  1778. SplatRight = SplatRight && (Right[i - 1] == Right[i]);
  1779. SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
  1780. AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
  1781. (cast<Instruction>(Left[i - 1])->getOpcode() ==
  1782. cast<Instruction>(Left[i])->getOpcode());
  1783. AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
  1784. (cast<Instruction>(Right[i - 1])->getOpcode() ==
  1785. cast<Instruction>(Right[i])->getOpcode());
  1786. }
1787. // If one operand ends up being a broadcast, return this operand order.
  1788. if (SplatRight || SplatLeft)
  1789. return;
1790. // Finally, check if we can get a longer vectorizable chain by reordering
1791. // without breaking the good operand order detected above.
1792. // E.g. if we have something like:
1793. // load a[0] load b[0]
1794. // load b[1] load a[1]
1795. // load a[2] load b[2]
1796. // load a[3] load b[3]
1797. // Reordering the operands of the second row (load b[1], load a[1]) would
1798. // allow us to vectorize this code and still retain the AllSameOpcode property.
1799. // FIXME: This load reordering might break AllSameOpcode in some rare cases
1800. // such as:
1801. // add a[0],c[0] load b[0]
1802. // add a[1],c[2] load b[1]
1803. // load a[2]     load b[2]
1804. // add a[3],c[3] load b[3]
  1805. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  1806. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  1807. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  1808. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  1809. std::swap(Left[j + 1], Right[j + 1]);
  1810. continue;
  1811. }
  1812. }
  1813. }
  1814. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  1815. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  1816. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  1817. std::swap(Left[j + 1], Right[j + 1]);
  1818. continue;
  1819. }
  1820. }
  1821. }
  1822. // else unchanged
  1823. }
  1824. }
  1825. void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
  1826. Instruction *VL0 = cast<Instruction>(VL[0]);
  1827. BasicBlock::iterator NextInst(VL0);
  1828. ++NextInst;
  1829. Builder.SetInsertPoint(VL0->getParent(), NextInst);
  1830. Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
  1831. }
  1832. Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  1833. Value *Vec = UndefValue::get(Ty);
  1834. // Generate the 'InsertElement' instruction.
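// The loop builds an insertelement chain, roughly (illustrative IR):
//   %v0 = insertelement <4 x i32> undef, i32 %s0, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 %s1, i32 1
//   ...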
  1835. for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
  1836. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
  1837. if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
  1838. GatherSeq.insert(Insrt);
  1839. CSEBlocks.insert(Insrt->getParent());
  1840. // Add to our 'need-to-extract' list.
  1841. if (ScalarToTreeEntry.count(VL[i])) {
  1842. int Idx = ScalarToTreeEntry[VL[i]];
  1843. TreeEntry *E = &VectorizableTree[Idx];
  1844. // Find which lane we need to extract.
  1845. int FoundLane = -1;
  1846. for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
  1847. // Is this the lane of the scalar that we are looking for ?
  1848. if (E->Scalars[Lane] == VL[i]) {
  1849. FoundLane = Lane;
  1850. break;
  1851. }
  1852. }
  1853. assert(FoundLane >= 0 && "Could not find the correct lane");
  1854. ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
  1855. }
  1856. }
  1857. }
  1858. return Vec;
  1859. }
  1860. Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
  1861. SmallDenseMap<Value*, int>::const_iterator Entry
  1862. = ScalarToTreeEntry.find(VL[0]);
  1863. if (Entry != ScalarToTreeEntry.end()) {
  1864. int Idx = Entry->second;
  1865. const TreeEntry *En = &VectorizableTree[Idx];
  1866. if (En->isSame(VL) && En->VectorizedValue)
  1867. return En->VectorizedValue;
  1868. }
  1869. return nullptr;
  1870. }
  1871. Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  1872. if (ScalarToTreeEntry.count(VL[0])) {
  1873. int Idx = ScalarToTreeEntry[VL[0]];
  1874. TreeEntry *E = &VectorizableTree[Idx];
  1875. if (E->isSame(VL))
  1876. return vectorizeTree(E);
  1877. }
  1878. Type *ScalarTy = VL[0]->getType();
  1879. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1880. ScalarTy = SI->getValueOperand()->getType();
  1881. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1882. return Gather(VL, VecTy);
  1883. }
  1884. Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  1885. IRBuilder<>::InsertPointGuard Guard(Builder);
  1886. if (E->VectorizedValue) {
  1887. DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
  1888. return E->VectorizedValue;
  1889. }
  1890. Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  1891. Type *ScalarTy = VL0->getType();
  1892. if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
  1893. ScalarTy = SI->getValueOperand()->getType();
  1894. VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
  1895. if (E->NeedToGather) {
  1896. setInsertPointAfterBundle(E->Scalars);
  1897. return Gather(E->Scalars, VecTy);
  1898. }
  1899. unsigned Opcode = getSameOpcode(E->Scalars);
  1900. switch (Opcode) {
  1901. case Instruction::PHI: {
  1902. PHINode *PH = dyn_cast<PHINode>(VL0);
  1903. Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
  1904. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1905. PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  1906. E->VectorizedValue = NewPhi;
  1907. // PHINodes may have multiple entries from the same block. We want to
  1908. // visit every block once.
  1909. SmallSet<BasicBlock*, 4> VisitedBBs;
  1910. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1911. ValueList Operands;
  1912. BasicBlock *IBB = PH->getIncomingBlock(i);
  1913. if (!VisitedBBs.insert(IBB).second) {
  1914. NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
  1915. continue;
  1916. }
  1917. // Prepare the operand vector.
  1918. for (Value *V : E->Scalars)
  1919. Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
  1920. Builder.SetInsertPoint(IBB->getTerminator());
  1921. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1922. Value *Vec = vectorizeTree(Operands);
  1923. NewPhi->addIncoming(Vec, IBB);
  1924. }
  1925. assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
  1926. "Invalid number of incoming values");
  1927. return NewPhi;
  1928. }
  1929. case Instruction::ExtractElement: {
  1930. if (CanReuseExtract(E->Scalars)) {
  1931. Value *V = VL0->getOperand(0);
  1932. E->VectorizedValue = V;
  1933. return V;
  1934. }
  1935. return Gather(E->Scalars, VecTy);
  1936. }
  1937. case Instruction::ZExt:
  1938. case Instruction::SExt:
  1939. case Instruction::FPToUI:
  1940. case Instruction::FPToSI:
  1941. case Instruction::FPExt:
  1942. case Instruction::PtrToInt:
  1943. case Instruction::IntToPtr:
  1944. case Instruction::SIToFP:
  1945. case Instruction::UIToFP:
  1946. case Instruction::Trunc:
  1947. case Instruction::FPTrunc:
  1948. case Instruction::BitCast: {
  1949. ValueList INVL;
  1950. for (Value *V : E->Scalars)
  1951. INVL.push_back(cast<Instruction>(V)->getOperand(0));
  1952. setInsertPointAfterBundle(E->Scalars);
  1953. Value *InVec = vectorizeTree(INVL);
  1954. if (Value *V = alreadyVectorized(E->Scalars))
  1955. return V;
  1956. CastInst *CI = dyn_cast<CastInst>(VL0);
  1957. Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
  1958. E->VectorizedValue = V;
  1959. ++NumVectorInstructions;
  1960. return V;
  1961. }
  1962. case Instruction::FCmp:
  1963. case Instruction::ICmp: {
  1964. ValueList LHSV, RHSV;
  1965. for (Value *V : E->Scalars) {
  1966. LHSV.push_back(cast<Instruction>(V)->getOperand(0));
  1967. RHSV.push_back(cast<Instruction>(V)->getOperand(1));
  1968. }
  1969. setInsertPointAfterBundle(E->Scalars);
  1970. Value *L = vectorizeTree(LHSV);
  1971. Value *R = vectorizeTree(RHSV);
  1972. if (Value *V = alreadyVectorized(E->Scalars))
  1973. return V;
  1974. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1975. Value *V;
  1976. if (Opcode == Instruction::FCmp)
  1977. V = Builder.CreateFCmp(P0, L, R);
  1978. else
  1979. V = Builder.CreateICmp(P0, L, R);
  1980. E->VectorizedValue = V;
  1981. ++NumVectorInstructions;
  1982. return V;
  1983. }
  1984. case Instruction::Select: {
  1985. ValueList TrueVec, FalseVec, CondVec;
  1986. for (Value *V : E->Scalars) {
  1987. CondVec.push_back(cast<Instruction>(V)->getOperand(0));
  1988. TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
  1989. FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
  1990. }
  1991. setInsertPointAfterBundle(E->Scalars);
  1992. Value *Cond = vectorizeTree(CondVec);
  1993. Value *True = vectorizeTree(TrueVec);
  1994. Value *False = vectorizeTree(FalseVec);
  1995. if (Value *V = alreadyVectorized(E->Scalars))
  1996. return V;
  1997. Value *V = Builder.CreateSelect(Cond, True, False);
  1998. E->VectorizedValue = V;
  1999. ++NumVectorInstructions;
  2000. return V;
  2001. }
  2002. case Instruction::Add:
  2003. case Instruction::FAdd:
  2004. case Instruction::Sub:
  2005. case Instruction::FSub:
  2006. case Instruction::Mul:
  2007. case Instruction::FMul:
  2008. case Instruction::UDiv:
  2009. case Instruction::SDiv:
  2010. case Instruction::FDiv:
  2011. case Instruction::URem:
  2012. case Instruction::SRem:
  2013. case Instruction::FRem:
  2014. case Instruction::Shl:
  2015. case Instruction::LShr:
  2016. case Instruction::AShr:
  2017. case Instruction::And:
  2018. case Instruction::Or:
  2019. case Instruction::Xor: {
  2020. ValueList LHSVL, RHSVL;
  2021. if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
  2022. reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
  2023. else
  2024. for (Value *V : E->Scalars) {
  2025. LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
  2026. RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
  2027. }
  2028. setInsertPointAfterBundle(E->Scalars);
  2029. Value *LHS = vectorizeTree(LHSVL);
  2030. Value *RHS = vectorizeTree(RHSVL);
  2031. if (LHS == RHS && isa<Instruction>(LHS)) {
  2032. assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
  2033. }
  2034. if (Value *V = alreadyVectorized(E->Scalars))
  2035. return V;
  2036. BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
  2037. Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
  2038. E->VectorizedValue = V;
  2039. propagateIRFlags(E->VectorizedValue, E->Scalars);
  2040. ++NumVectorInstructions;
  2041. if (Instruction *I = dyn_cast<Instruction>(V))
  2042. return propagateMetadata(I, E->Scalars);
  2043. return V;
  2044. }
  2045. case Instruction::Load: {
  2046. // Loads are inserted at the head of the tree because we don't want to
  2047. // sink them all the way down past store instructions.
  2048. setInsertPointAfterBundle(E->Scalars);
  2049. LoadInst *LI = cast<LoadInst>(VL0);
  2050. Type *ScalarLoadTy = LI->getType();
  2051. unsigned AS = LI->getPointerAddressSpace();
  2052. Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
  2053. VecTy->getPointerTo(AS));
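// Illustrative result for a bundle of four i32 loads (hypothetical IR):
//   %vp = bitcast i32* %p to <4 x i32>*
//   %wide = load <4 x i32>, <4 x i32>* %vp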
  2054. // The pointer operand uses an in-tree scalar so we add the new BitCast to
  2055. // ExternalUses list to make sure that an extract will be generated in the
  2056. // future.
  2057. if (ScalarToTreeEntry.count(LI->getPointerOperand()))
  2058. ExternalUses.push_back(
  2059. ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
  2060. unsigned Alignment = LI->getAlignment();
  2061. LI = Builder.CreateLoad(VecPtr);
  2062. if (!Alignment) {
  2063. Alignment = DL->getABITypeAlignment(ScalarLoadTy);
  2064. }
  2065. LI->setAlignment(Alignment);
  2066. E->VectorizedValue = LI;
  2067. ++NumVectorInstructions;
  2068. return propagateMetadata(LI, E->Scalars);
  2069. }
  2070. case Instruction::Store: {
  2071. StoreInst *SI = cast<StoreInst>(VL0);
  2072. unsigned Alignment = SI->getAlignment();
  2073. unsigned AS = SI->getPointerAddressSpace();
  2074. ValueList ValueOp;
  2075. for (Value *V : E->Scalars)
  2076. ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
  2077. setInsertPointAfterBundle(E->Scalars);
  2078. Value *VecValue = vectorizeTree(ValueOp);
  2079. Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
  2080. VecTy->getPointerTo(AS));
  2081. StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
  2082. // The pointer operand uses an in-tree scalar so we add the new BitCast to
  2083. // ExternalUses list to make sure that an extract will be generated in the
  2084. // future.
  2085. if (ScalarToTreeEntry.count(SI->getPointerOperand()))
  2086. ExternalUses.push_back(
  2087. ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
  2088. if (!Alignment) {
  2089. Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
  2090. }
  2091. S->setAlignment(Alignment);
  2092. E->VectorizedValue = S;
  2093. ++NumVectorInstructions;
  2094. return propagateMetadata(S, E->Scalars);
  2095. }
  2096. case Instruction::GetElementPtr: {
  2097. setInsertPointAfterBundle(E->Scalars);
  2098. ValueList Op0VL;
  2099. for (Value *V : E->Scalars)
  2100. Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
  2101. Value *Op0 = vectorizeTree(Op0VL);
  2102. std::vector<Value *> OpVecs;
  2103. for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
  2104. ++j) {
  2105. ValueList OpVL;
  2106. for (Value *V : E->Scalars)
  2107. OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
  2108. Value *OpVec = vectorizeTree(OpVL);
  2109. OpVecs.push_back(OpVec);
  2110. }
  2111. Value *V = Builder.CreateGEP(
  2112. cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
  2113. E->VectorizedValue = V;
  2114. ++NumVectorInstructions;
  2115. if (Instruction *I = dyn_cast<Instruction>(V))
  2116. return propagateMetadata(I, E->Scalars);
  2117. return V;
  2118. }
  2119. case Instruction::Call: {
  2120. CallInst *CI = cast<CallInst>(VL0);
  2121. setInsertPointAfterBundle(E->Scalars);
  2122. Function *FI;
  2123. Intrinsic::ID IID = Intrinsic::not_intrinsic;
  2124. Value *ScalarArg = nullptr;
  2125. if (CI && (FI = CI->getCalledFunction())) {
  2126. IID = FI->getIntrinsicID();
  2127. }
  2128. std::vector<Value *> OpVecs;
  2129. for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
  2130. ValueList OpVL;
  2131. // ctlz,cttz and powi are special intrinsics whose second argument is
  2132. // a scalar. This argument should not be vectorized.
  2133. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
  2134. CallInst *CEI = cast<CallInst>(E->Scalars[0]);
  2135. ScalarArg = CEI->getArgOperand(j);
  2136. OpVecs.push_back(CEI->getArgOperand(j));
  2137. continue;
  2138. }
  2139. for (Value *V : E->Scalars) {
  2140. CallInst *CEI = cast<CallInst>(V);
  2141. OpVL.push_back(CEI->getArgOperand(j));
  2142. }
  2143. Value *OpVec = vectorizeTree(OpVL);
  2144. DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
  2145. OpVecs.push_back(OpVec);
  2146. }
  2147. Module *M = F->getParent();
  2148. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  2149. Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
  2150. Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
  2151. Value *V = Builder.CreateCall(CF, OpVecs);
  2152. // The scalar argument uses an in-tree scalar so we add the new vectorized
  2153. // call to ExternalUses list to make sure that an extract will be
  2154. // generated in the future.
  2155. if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
  2156. ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
  2157. E->VectorizedValue = V;
  2158. ++NumVectorInstructions;
  2159. return V;
  2160. }
  2161. case Instruction::ShuffleVector: {
  2162. ValueList LHSVL, RHSVL;
  2163. assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
  2164. reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
  2165. setInsertPointAfterBundle(E->Scalars);
  2166. Value *LHS = vectorizeTree(LHSVL);
  2167. Value *RHS = vectorizeTree(RHSVL);
  2168. if (Value *V = alreadyVectorized(E->Scalars))
  2169. return V;
  2170. // Create a vector of LHS op1 RHS
  2171. BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
  2172. Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
  2173. // Create a vector of LHS op2 RHS
  2174. Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
  2175. BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
  2176. Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
2177. // Create a shuffle that takes alternate operations from the two vectors.
2178. // Also, gather up the odd and even scalar ops to propagate IR flags to
2179. // each vector operation.
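// E.g. for a bundle of four scalars the mask is <0, 5, 2, 7>: even lanes are
// taken from V0 and odd lanes from V1.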
  2180. ValueList OddScalars, EvenScalars;
  2181. unsigned e = E->Scalars.size();
  2182. SmallVector<Constant *, 8> Mask(e);
  2183. for (unsigned i = 0; i < e; ++i) {
  2184. if (i & 1) {
  2185. Mask[i] = Builder.getInt32(e + i);
  2186. OddScalars.push_back(E->Scalars[i]);
  2187. } else {
  2188. Mask[i] = Builder.getInt32(i);
  2189. EvenScalars.push_back(E->Scalars[i]);
  2190. }
  2191. }
  2192. Value *ShuffleMask = ConstantVector::get(Mask);
  2193. propagateIRFlags(V0, EvenScalars);
  2194. propagateIRFlags(V1, OddScalars);
  2195. Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
  2196. E->VectorizedValue = V;
  2197. ++NumVectorInstructions;
  2198. if (Instruction *I = dyn_cast<Instruction>(V))
  2199. return propagateMetadata(I, E->Scalars);
  2200. return V;
  2201. }
  2202. default:
  2203. llvm_unreachable("unknown inst");
  2204. }
  2205. return nullptr;
  2206. }
  2207. Value *BoUpSLP::vectorizeTree() {
  2208. // All blocks must be scheduled before any instructions are inserted.
  2209. for (auto &BSIter : BlocksSchedules) {
  2210. scheduleBlock(BSIter.second.get());
  2211. }
  2212. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2213. auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
  2214. // If the vectorized tree can be rewritten in a smaller type, we truncate the
  2215. // vectorized root. InstCombine will then rewrite the entire expression. We
  2216. // sign extend the extracted values below.
  2217. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  2218. if (MinBWs.count(ScalarRoot)) {
  2219. if (auto *I = dyn_cast<Instruction>(VectorRoot))
  2220. Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
  2221. auto BundleWidth = VectorizableTree[0].Scalars.size();
  2222. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
  2223. auto *VecTy = VectorType::get(MinTy, BundleWidth);
  2224. auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
  2225. VectorizableTree[0].VectorizedValue = Trunc;
  2226. }
  2227. DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
  2228. // Extract all of the elements with the external uses.
  2229. for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
  2230. it != e; ++it) {
  2231. Value *Scalar = it->Scalar;
  2232. llvm::User *User = it->User;
2233. // Skip users that we have already RAUW'd. This happens when one
2234. // instruction has multiple uses of the same value.
  2235. if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
  2236. Scalar->user_end())
  2237. continue;
  2238. assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
  2239. int Idx = ScalarToTreeEntry[Scalar];
  2240. TreeEntry *E = &VectorizableTree[Idx];
  2241. assert(!E->NeedToGather && "Extracting from a gather list");
  2242. Value *Vec = E->VectorizedValue;
  2243. assert(Vec && "Can't find vectorizable value");
  2244. Value *Lane = Builder.getInt32(it->Lane);
  2245. // Generate extracts for out-of-tree users.
  2246. // Find the insertion point for the extractelement lane.
  2247. if (auto *VecI = dyn_cast<Instruction>(Vec)) {
  2248. if (PHINode *PH = dyn_cast<PHINode>(User)) {
  2249. for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
  2250. if (PH->getIncomingValue(i) == Scalar) {
  2251. TerminatorInst *IncomingTerminator =
  2252. PH->getIncomingBlock(i)->getTerminator();
  2253. if (isa<CatchSwitchInst>(IncomingTerminator)) {
  2254. Builder.SetInsertPoint(VecI->getParent(),
  2255. std::next(VecI->getIterator()));
  2256. } else {
  2257. Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
  2258. }
  2259. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2260. if (MinBWs.count(ScalarRoot))
  2261. Ex = Builder.CreateSExt(Ex, Scalar->getType());
  2262. CSEBlocks.insert(PH->getIncomingBlock(i));
  2263. PH->setOperand(i, Ex);
  2264. }
  2265. }
  2266. } else {
  2267. Builder.SetInsertPoint(cast<Instruction>(User));
  2268. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2269. if (MinBWs.count(ScalarRoot))
  2270. Ex = Builder.CreateSExt(Ex, Scalar->getType());
  2271. CSEBlocks.insert(cast<Instruction>(User)->getParent());
  2272. User->replaceUsesOfWith(Scalar, Ex);
  2273. }
  2274. } else {
  2275. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2276. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2277. if (MinBWs.count(ScalarRoot))
  2278. Ex = Builder.CreateSExt(Ex, Scalar->getType());
  2279. CSEBlocks.insert(&F->getEntryBlock());
  2280. User->replaceUsesOfWith(Scalar, Ex);
  2281. }
  2282. DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  2283. }
  2284. // For each vectorized value:
  2285. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  2286. TreeEntry *Entry = &VectorizableTree[EIdx];
  2287. // For each lane:
  2288. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  2289. Value *Scalar = Entry->Scalars[Lane];
  2290. // No need to handle users of gathered values.
  2291. if (Entry->NeedToGather)
  2292. continue;
  2293. assert(Entry->VectorizedValue && "Can't find vectorizable value");
  2294. Type *Ty = Scalar->getType();
  2295. if (!Ty->isVoidTy()) {
  2296. #ifndef NDEBUG
  2297. for (User *U : Scalar->users()) {
  2298. DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
  2299. assert((ScalarToTreeEntry.count(U) ||
  2300. // It is legal to replace users in the ignorelist by undef.
  2301. (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
  2302. UserIgnoreList.end())) &&
  2303. "Replacing out-of-tree value with undef");
  2304. }
  2305. #endif
  2306. Value *Undef = UndefValue::get(Ty);
  2307. Scalar->replaceAllUsesWith(Undef);
  2308. }
  2309. DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
  2310. eraseInstruction(cast<Instruction>(Scalar));
  2311. }
  2312. }
  2313. Builder.ClearInsertionPoint();
  2314. return VectorizableTree[0].VectorizedValue;
  2315. }
  2316. void BoUpSLP::optimizeGatherSequence() {
  2317. DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
  2318. << " gather sequences instructions.\n");
  2319. // LICM InsertElementInst sequences.
  2320. for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
  2321. e = GatherSeq.end(); it != e; ++it) {
  2322. InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
  2323. if (!Insert)
  2324. continue;
  2325. // Check if this block is inside a loop.
  2326. Loop *L = LI->getLoopFor(Insert->getParent());
  2327. if (!L)
  2328. continue;
  2329. // Check if it has a preheader.
  2330. BasicBlock *PreHeader = L->getLoopPreheader();
  2331. if (!PreHeader)
  2332. continue;
  2333. // If the vector or the element that we insert into it are
  2334. // instructions that are defined in this basic block then we can't
  2335. // hoist this instruction.
  2336. Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
  2337. Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
  2338. if (CurrVec && L->contains(CurrVec))
  2339. continue;
  2340. if (NewElem && L->contains(NewElem))
  2341. continue;
  2342. // We can hoist this instruction. Move it to the pre-header.
  2343. Insert->moveBefore(PreHeader->getTerminator());
  2344. }
  2345. // Make a list of all reachable blocks in our CSE queue.
  2346. SmallVector<const DomTreeNode *, 8> CSEWorkList;
  2347. CSEWorkList.reserve(CSEBlocks.size());
  2348. for (BasicBlock *BB : CSEBlocks)
  2349. if (DomTreeNode *N = DT->getNode(BB)) {
  2350. assert(DT->isReachableFromEntry(N));
  2351. CSEWorkList.push_back(N);
  2352. }
  2353. // Sort blocks by domination. This ensures we visit a block after all blocks
  2354. // dominating it are visited.
  2355. std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
  2356. [this](const DomTreeNode *A, const DomTreeNode *B) {
  2357. return DT->properlyDominates(A, B);
  2358. });
  2359. // Perform O(N^2) search over the gather sequences and merge identical
  2360. // instructions. TODO: We can further optimize this scan if we split the
  2361. // instructions into different buckets based on the insert lane.
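// Illustrative example (assumed): if a dominating block and a dominated block
// both contain an extractelement of the same vector and lane, the dominated
// copy is replaced by the dominating one and erased below.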
  2362. SmallVector<Instruction *, 16> Visited;
  2363. for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
  2364. assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
  2365. "Worklist not sorted properly!");
  2366. BasicBlock *BB = (*I)->getBlock();
  2367. // For all instructions in blocks containing gather sequences:
  2368. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
  2369. Instruction *In = &*it++;
  2370. if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
  2371. continue;
  2372. // Check if we can replace this instruction with any of the
  2373. // visited instructions.
  2374. for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
  2375. ve = Visited.end();
  2376. v != ve; ++v) {
  2377. if (In->isIdenticalTo(*v) &&
  2378. DT->dominates((*v)->getParent(), In->getParent())) {
  2379. In->replaceAllUsesWith(*v);
  2380. eraseInstruction(In);
  2381. In = nullptr;
  2382. break;
  2383. }
  2384. }
  2385. if (In) {
  2386. assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
  2387. Visited.push_back(In);
  2388. }
  2389. }
  2390. }
  2391. CSEBlocks.clear();
  2392. GatherSeq.clear();
  2393. }
2394. // Groups the instructions into a bundle (which is then a single scheduling entity)
  2395. // and schedules instructions until the bundle gets ready.
  2396. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
  2397. BoUpSLP *SLP) {
  2398. if (isa<PHINode>(VL[0]))
  2399. return true;
  2400. // Initialize the instruction bundle.
  2401. Instruction *OldScheduleEnd = ScheduleEnd;
  2402. ScheduleData *PrevInBundle = nullptr;
  2403. ScheduleData *Bundle = nullptr;
  2404. bool ReSchedule = false;
  2405. DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
  2406. // Make sure that the scheduling region contains all
  2407. // instructions of the bundle.
  2408. for (Value *V : VL) {
  2409. if (!extendSchedulingRegion(V))
  2410. return false;
  2411. }
  2412. for (Value *V : VL) {
  2413. ScheduleData *BundleMember = getScheduleData(V);
  2414. assert(BundleMember &&
  2415. "no ScheduleData for bundle member (maybe not in same basic block)");
  2416. if (BundleMember->IsScheduled) {
  2417. // A bundle member was scheduled as single instruction before and now
  2418. // needs to be scheduled as part of the bundle. We just get rid of the
  2419. // existing schedule.
  2420. DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
  2421. << " was already scheduled\n");
  2422. ReSchedule = true;
  2423. }
  2424. assert(BundleMember->isSchedulingEntity() &&
  2425. "bundle member already part of other bundle");
  2426. if (PrevInBundle) {
  2427. PrevInBundle->NextInBundle = BundleMember;
  2428. } else {
  2429. Bundle = BundleMember;
  2430. }
  2431. BundleMember->UnscheduledDepsInBundle = 0;
  2432. Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2433. // Group the instructions into a bundle.
  2434. BundleMember->FirstInBundle = Bundle;
  2435. PrevInBundle = BundleMember;
  2436. }
  2437. if (ScheduleEnd != OldScheduleEnd) {
  2438. // The scheduling region got new instructions at the lower end (or it is a
  2439. // new region for the first bundle). This makes it necessary to
  2440. // recalculate all dependencies.
  2441. // It is seldom that this needs to be done a second time after adding the
  2442. // initial bundle to the region.
  2443. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  2444. ScheduleData *SD = getScheduleData(I);
  2445. SD->clearDependencies();
  2446. }
  2447. ReSchedule = true;
  2448. }
  2449. if (ReSchedule) {
  2450. resetSchedule();
  2451. initialFillReadyList(ReadyInsts);
  2452. }
  2453. DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
  2454. << BB->getName() << "\n");
  2455. calculateDependencies(Bundle, true, SLP);
  2456. // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  2457. // means that there are no cyclic dependencies and we can schedule it.
2458. // Note that it's important that we don't "schedule" the bundle yet (see
  2459. // cancelScheduling).
  2460. while (!Bundle->isReady() && !ReadyInsts.empty()) {
  2461. ScheduleData *pickedSD = ReadyInsts.back();
  2462. ReadyInsts.pop_back();
  2463. if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
  2464. schedule(pickedSD, ReadyInsts);
  2465. }
  2466. }
  2467. if (!Bundle->isReady()) {
  2468. cancelScheduling(VL);
  2469. return false;
  2470. }
  2471. return true;
  2472. }
  2473. void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
  2474. if (isa<PHINode>(VL[0]))
  2475. return;
  2476. ScheduleData *Bundle = getScheduleData(VL[0]);
  2477. DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  2478. assert(!Bundle->IsScheduled &&
  2479. "Can't cancel bundle which is already scheduled");
  2480. assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
  2481. "tried to unbundle something which is not a bundle");
  2482. // Un-bundle: make single instructions out of the bundle.
  2483. ScheduleData *BundleMember = Bundle;
  2484. while (BundleMember) {
  2485. assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
  2486. BundleMember->FirstInBundle = BundleMember;
  2487. ScheduleData *Next = BundleMember->NextInBundle;
  2488. BundleMember->NextInBundle = nullptr;
  2489. BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
  2490. if (BundleMember->UnscheduledDepsInBundle == 0) {
  2491. ReadyInsts.insert(BundleMember);
  2492. }
  2493. BundleMember = Next;
  2494. }
  2495. }
  2496. bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  2497. if (getScheduleData(V))
  2498. return true;
  2499. Instruction *I = dyn_cast<Instruction>(V);
  2500. assert(I && "bundle member must be an instruction");
  2501. assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  2502. if (!ScheduleStart) {
  2503. // It's the first instruction in the new region.
  2504. initScheduleData(I, I->getNextNode(), nullptr, nullptr);
  2505. ScheduleStart = I;
  2506. ScheduleEnd = I->getNextNode();
  2507. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2508. DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
  2509. return true;
  2510. }
  2511. // Search up and down at the same time, because we don't know if the new
  2512. // instruction is above or below the existing scheduling region.
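// Illustrative sketch (assumed): with the current region [ScheduleStart,
// ScheduleEnd), if I lies above the region the upward iterator reaches it
// first and the region grows to [I, ScheduleEnd); if I lies below, the
// downward iterator reaches it and the region grows to
// [ScheduleStart, I->getNextNode()).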
  2513. BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
  2514. BasicBlock::reverse_iterator UpperEnd = BB->rend();
  2515. BasicBlock::iterator DownIter(ScheduleEnd);
  2516. BasicBlock::iterator LowerEnd = BB->end();
  2517. for (;;) {
  2518. if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
  2519. DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
  2520. return false;
  2521. }
  2522. if (UpIter != UpperEnd) {
  2523. if (&*UpIter == I) {
  2524. initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  2525. ScheduleStart = I;
  2526. DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
  2527. return true;
  2528. }
  2529. UpIter++;
  2530. }
  2531. if (DownIter != LowerEnd) {
  2532. if (&*DownIter == I) {
  2533. initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  2534. nullptr);
  2535. ScheduleEnd = I->getNextNode();
  2536. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2537. DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  2538. return true;
  2539. }
  2540. DownIter++;
  2541. }
  2542. assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
  2543. "instruction not found in block");
  2544. }
  2545. return true;
  2546. }
  2547. void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
  2548. Instruction *ToI,
  2549. ScheduleData *PrevLoadStore,
  2550. ScheduleData *NextLoadStore) {
  2551. ScheduleData *CurrentLoadStore = PrevLoadStore;
  2552. for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
  2553. ScheduleData *SD = ScheduleDataMap[I];
  2554. if (!SD) {
  2555. // Allocate a new ScheduleData for the instruction.
  2556. if (ChunkPos >= ChunkSize) {
  2557. ScheduleDataChunks.push_back(
  2558. llvm::make_unique<ScheduleData[]>(ChunkSize));
  2559. ChunkPos = 0;
  2560. }
  2561. SD = &(ScheduleDataChunks.back()[ChunkPos++]);
  2562. ScheduleDataMap[I] = SD;
  2563. SD->Inst = I;
  2564. }
  2565. assert(!isInSchedulingRegion(SD) &&
  2566. "new ScheduleData already in scheduling region");
  2567. SD->init(SchedulingRegionID);
  2568. if (I->mayReadOrWriteMemory()) {
  2569. // Update the linked list of memory accessing instructions.
  2570. if (CurrentLoadStore) {
  2571. CurrentLoadStore->NextLoadStore = SD;
  2572. } else {
  2573. FirstLoadStoreInRegion = SD;
  2574. }
  2575. CurrentLoadStore = SD;
  2576. }
  2577. }
  2578. if (NextLoadStore) {
  2579. if (CurrentLoadStore)
  2580. CurrentLoadStore->NextLoadStore = NextLoadStore;
  2581. } else {
  2582. LastLoadStoreInRegion = CurrentLoadStore;
  2583. }
  2584. }
  2585. void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
  2586. bool InsertInReadyList,
  2587. BoUpSLP *SLP) {
  2588. assert(SD->isSchedulingEntity());
  2589. SmallVector<ScheduleData *, 10> WorkList;
  2590. WorkList.push_back(SD);
  2591. while (!WorkList.empty()) {
  2592. ScheduleData *SD = WorkList.back();
  2593. WorkList.pop_back();
  2594. ScheduleData *BundleMember = SD;
  2595. while (BundleMember) {
  2596. assert(isInSchedulingRegion(BundleMember));
  2597. if (!BundleMember->hasValidDependencies()) {
  2598. DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
  2599. BundleMember->Dependencies = 0;
  2600. BundleMember->resetUnscheduledDeps();
  2601. // Handle def-use chain dependencies.
  2602. for (User *U : BundleMember->Inst->users()) {
  2603. if (isa<Instruction>(U)) {
  2604. ScheduleData *UseSD = getScheduleData(U);
  2605. if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
  2606. BundleMember->Dependencies++;
  2607. ScheduleData *DestBundle = UseSD->FirstInBundle;
  2608. if (!DestBundle->IsScheduled) {
  2609. BundleMember->incrementUnscheduledDeps(1);
  2610. }
  2611. if (!DestBundle->hasValidDependencies()) {
  2612. WorkList.push_back(DestBundle);
  2613. }
  2614. }
  2615. } else {
  2616. // I'm not sure if this can ever happen. But we need to be safe.
2617. // This prevents the instruction/bundle from ever being scheduled,
2618. // which eventually disables vectorization.
  2619. BundleMember->Dependencies++;
  2620. BundleMember->incrementUnscheduledDeps(1);
  2621. }
  2622. }
  2623. // Handle the memory dependencies.
  2624. ScheduleData *DepDest = BundleMember->NextLoadStore;
  2625. if (DepDest) {
  2626. Instruction *SrcInst = BundleMember->Inst;
  2627. MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
  2628. bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
  2629. unsigned numAliased = 0;
  2630. unsigned DistToSrc = 1;
  2631. while (DepDest) {
  2632. assert(isInSchedulingRegion(DepDest));
  2633. // We have two limits to reduce the complexity:
  2634. // 1) AliasedCheckLimit: It's a small limit to reduce calls to
  2635. // SLP->isAliased (which is the expensive part in this loop).
  2636. // 2) MaxMemDepDistance: It's for very large blocks and it aborts
  2637. // the whole loop (even if the loop is fast, it's quadratic).
  2638. // It's important for the loop break condition (see below) to
  2639. // check this limit even between two read-only instructions.
  2640. if (DistToSrc >= MaxMemDepDistance ||
  2641. ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
  2642. (numAliased >= AliasedCheckLimit ||
  2643. SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
  2644. // We increment the counter only if the locations are aliased
  2645. // (instead of counting all alias checks). This gives a better
  2646. // balance between reduced runtime and accurate dependencies.
  2647. numAliased++;
  2648. DepDest->MemoryDependencies.push_back(BundleMember);
  2649. BundleMember->Dependencies++;
  2650. ScheduleData *DestBundle = DepDest->FirstInBundle;
  2651. if (!DestBundle->IsScheduled) {
  2652. BundleMember->incrementUnscheduledDeps(1);
  2653. }
  2654. if (!DestBundle->hasValidDependencies()) {
  2655. WorkList.push_back(DestBundle);
  2656. }
  2657. }
  2658. DepDest = DepDest->NextLoadStore;
  2659. // Example, explaining the loop break condition: Let's assume our
  2660. // starting instruction is i0 and MaxMemDepDistance = 3.
  2661. //
  2662. // +--------v--v--v
  2663. // i0,i1,i2,i3,i4,i5,i6,i7,i8
  2664. // +--------^--^--^
  2665. //
2666. // MaxMemDepDistance lets us stop alias-checking at i3 and we add
  2667. // dependencies from i0 to i3,i4,.. (even if they are not aliased).
  2668. // Previously we already added dependencies from i3 to i6,i7,i8
  2669. // (because of MaxMemDepDistance). As we added a dependency from
  2670. // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
  2671. // and we can abort this loop at i6.
  2672. if (DistToSrc >= 2 * MaxMemDepDistance)
  2673. break;
  2674. DistToSrc++;
  2675. }
  2676. }
  2677. }
  2678. BundleMember = BundleMember->NextInBundle;
  2679. }
  2680. if (InsertInReadyList && SD->isReady()) {
  2681. ReadyInsts.push_back(SD);
  2682. DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
  2683. }
  2684. }
  2685. }
  2686. void BoUpSLP::BlockScheduling::resetSchedule() {
  2687. assert(ScheduleStart &&
  2688. "tried to reset schedule on block which has not been scheduled");
  2689. for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  2690. ScheduleData *SD = getScheduleData(I);
  2691. assert(isInSchedulingRegion(SD));
  2692. SD->IsScheduled = false;
  2693. SD->resetUnscheduledDeps();
  2694. }
  2695. ReadyInsts.clear();
  2696. }
  2697. void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  2698. if (!BS->ScheduleStart)
  2699. return;
  2700. DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  2701. BS->resetSchedule();
  2702. // For the real scheduling we use a more sophisticated ready-list: it is
  2703. // sorted by the original instruction location. This lets the final schedule
  2704. // be as close as possible to the original instruction order.
  2705. struct ScheduleDataCompare {
  2706. bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
  2707. return SD2->SchedulingPriority < SD1->SchedulingPriority;
  2708. }
  2709. };
  2710. std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
  2711. // Ensure that all dependency data is updated and fill the ready-list with
  2712. // initial instructions.
  2713. int Idx = 0;
  2714. int NumToSchedule = 0;
  2715. for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
  2716. I = I->getNextNode()) {
  2717. ScheduleData *SD = BS->getScheduleData(I);
  2718. assert(
  2719. SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
  2720. "scheduler and vectorizer have different opinion on what is a bundle");
  2721. SD->FirstInBundle->SchedulingPriority = Idx++;
  2722. if (SD->isSchedulingEntity()) {
  2723. BS->calculateDependencies(SD, false, this);
  2724. NumToSchedule++;
  2725. }
  2726. }
  2727. BS->initialFillReadyList(ReadyInsts);
  2728. Instruction *LastScheduledInst = BS->ScheduleEnd;
  2729. // Do the "real" scheduling.
  2730. while (!ReadyInsts.empty()) {
  2731. ScheduleData *picked = *ReadyInsts.begin();
  2732. ReadyInsts.erase(ReadyInsts.begin());
  2733. // Move the scheduled instruction(s) to their dedicated places, if not
  2734. // there yet.
  2735. ScheduleData *BundleMember = picked;
  2736. while (BundleMember) {
  2737. Instruction *pickedInst = BundleMember->Inst;
  2738. if (LastScheduledInst->getNextNode() != pickedInst) {
  2739. BS->BB->getInstList().remove(pickedInst);
  2740. BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
  2741. pickedInst);
  2742. }
  2743. LastScheduledInst = pickedInst;
  2744. BundleMember = BundleMember->NextInBundle;
  2745. }
  2746. BS->schedule(picked, ReadyInsts);
  2747. NumToSchedule--;
  2748. }
  2749. assert(NumToSchedule == 0 && "could not schedule all instructions");
  2750. // Avoid duplicate scheduling of the block.
  2751. BS->ScheduleStart = nullptr;
  2752. }
  2753. unsigned BoUpSLP::getVectorElementSize(Value *V) {
  2754. // If V is a store, just return the width of the stored value without
  2755. // traversing the expression tree. This is the common case.
  2756. if (auto *Store = dyn_cast<StoreInst>(V))
  2757. return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  2758. // If V is not a store, we can traverse the expression tree to find loads
  2759. // that feed it. The type of the loaded value may indicate a more suitable
  2760. // width than V's type. We want to base the vector element size on the width
  2761. // of memory operations where possible.
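// Illustrative example (assumed):
//   %w = load i16, i16* %p
//   %z = zext i16 %w to i32
//   %v = add i32 %x, %z
// the walk below starting at %v finds the i16 load, so MaxWidth becomes 16
// and the element size is based on the 16-bit memory access rather than on
// the 32-bit type of %v.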
  2762. SmallVector<Instruction *, 16> Worklist;
  2763. SmallPtrSet<Instruction *, 16> Visited;
  2764. if (auto *I = dyn_cast<Instruction>(V))
  2765. Worklist.push_back(I);
  2766. // Traverse the expression tree in bottom-up order looking for loads. If we
2767. // encounter an instruction we don't yet handle, we give up.
  2768. auto MaxWidth = 0u;
  2769. auto FoundUnknownInst = false;
  2770. while (!Worklist.empty() && !FoundUnknownInst) {
  2771. auto *I = Worklist.pop_back_val();
  2772. Visited.insert(I);
  2773. // We should only be looking at scalar instructions here. If the current
  2774. // instruction has a vector type, give up.
  2775. auto *Ty = I->getType();
  2776. if (isa<VectorType>(Ty))
  2777. FoundUnknownInst = true;
  2778. // If the current instruction is a load, update MaxWidth to reflect the
  2779. // width of the loaded value.
  2780. else if (isa<LoadInst>(I))
  2781. MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
  2782. // Otherwise, we need to visit the operands of the instruction. We only
  2783. // handle the interesting cases from buildTree here. If an operand is an
  2784. // instruction we haven't yet visited, we add it to the worklist.
  2785. else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
  2786. isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
  2787. for (Use &U : I->operands())
  2788. if (auto *J = dyn_cast<Instruction>(U.get()))
  2789. if (!Visited.count(J))
  2790. Worklist.push_back(J);
  2791. }
  2792. // If we don't yet handle the instruction, give up.
  2793. else
  2794. FoundUnknownInst = true;
  2795. }
  2796. // If we didn't encounter a memory access in the expression tree, or if we
  2797. // gave up for some reason, just return the width of V.
  2798. if (!MaxWidth || FoundUnknownInst)
  2799. return DL->getTypeSizeInBits(V->getType());
  2800. // Otherwise, return the maximum width we found.
  2801. return MaxWidth;
  2802. }
  2803. // Determine if a value V in a vectorizable expression Expr can be demoted to a
  2804. // smaller type with a truncation. We collect the values that will be demoted
  2805. // in ToDemote and additional roots that require investigating in Roots.
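// Illustrative example (assumed): for
//   %za = zext i8 %a to i32
//   %zb = zext i8 %b to i32
//   %s  = add i32 %za, %zb
// the add and both zexts can be demoted, so the computation can later be
// performed in i8. If an operand is itself a trunc, its wide operand is
// pushed to Roots for further investigation.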
  2806. static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
  2807. SmallVectorImpl<Value *> &ToDemote,
  2808. SmallVectorImpl<Value *> &Roots) {
  2809. // We can always demote constants.
  2810. if (isa<Constant>(V)) {
  2811. ToDemote.push_back(V);
  2812. return true;
  2813. }
  2814. // If the value is not an instruction in the expression with only one use, it
  2815. // cannot be demoted.
  2816. auto *I = dyn_cast<Instruction>(V);
  2817. if (!I || !I->hasOneUse() || !Expr.count(I))
  2818. return false;
  2819. switch (I->getOpcode()) {
  2820. // We can always demote truncations and extensions. Since truncations can
  2821. // seed additional demotion, we save the truncated value.
  2822. case Instruction::Trunc:
  2823. Roots.push_back(I->getOperand(0));
  2824. case Instruction::ZExt:
  2825. case Instruction::SExt:
  2826. break;
  2827. // We can demote certain binary operations if we can demote both of their
  2828. // operands.
  2829. case Instruction::Add:
  2830. case Instruction::Sub:
  2831. case Instruction::Mul:
  2832. case Instruction::And:
  2833. case Instruction::Or:
  2834. case Instruction::Xor:
  2835. if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
  2836. !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
  2837. return false;
  2838. break;
  2839. // We can demote selects if we can demote their true and false values.
  2840. case Instruction::Select: {
  2841. SelectInst *SI = cast<SelectInst>(I);
  2842. if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
  2843. !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
  2844. return false;
  2845. break;
  2846. }
  2847. // We can demote phis if we can demote all their incoming operands. Note that
  2848. // we don't need to worry about cycles since we ensure single use above.
  2849. case Instruction::PHI: {
  2850. PHINode *PN = cast<PHINode>(I);
  2851. for (Value *IncValue : PN->incoming_values())
  2852. if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
  2853. return false;
  2854. break;
  2855. }
  2856. // Otherwise, conservatively give up.
  2857. default:
  2858. return false;
  2859. }
  2860. // Record the value that we can demote.
  2861. ToDemote.push_back(V);
  2862. return true;
  2863. }
  2864. void BoUpSLP::computeMinimumValueSizes() {
  2865. // If there are no external uses, the expression tree must be rooted by a
  2866. // store. We can't demote in-memory values, so there is nothing to do here.
  2867. if (ExternalUses.empty())
  2868. return;
  2869. // We only attempt to truncate integer expressions.
  2870. auto &TreeRoot = VectorizableTree[0].Scalars;
  2871. auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
  2872. if (!TreeRootIT)
  2873. return;
  2874. // If the expression is not rooted by a store, these roots should have
  2875. // external uses. We will rely on InstCombine to rewrite the expression in
  2876. // the narrower type. However, InstCombine only rewrites single-use values.
  2877. // This means that if a tree entry other than a root is used externally, it
  2878. // must have multiple uses and InstCombine will not rewrite it. The code
  2879. // below ensures that only the roots are used externally.
  2880. SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
  2881. for (auto &EU : ExternalUses)
  2882. if (!Expr.erase(EU.Scalar))
  2883. return;
  2884. if (!Expr.empty())
  2885. return;
  2886. // Collect the scalar values of the vectorizable expression. We will use this
  2887. // context to determine which values can be demoted. If we see a truncation,
  2888. // we mark it as seeding another demotion.
  2889. for (auto &Entry : VectorizableTree)
  2890. Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
  2891. // Ensure the roots of the vectorizable tree don't form a cycle. They must
  2892. // have a single external user that is not in the vectorizable tree.
  2893. for (auto *Root : TreeRoot)
  2894. if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
  2895. return;
  2896. // Conservatively determine if we can actually truncate the roots of the
  2897. // expression. Collect the values that can be demoted in ToDemote and
  2898. // additional roots that require investigating in Roots.
  2899. SmallVector<Value *, 32> ToDemote;
  2900. SmallVector<Value *, 4> Roots;
  2901. for (auto *Root : TreeRoot)
  2902. if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
  2903. return;
  2904. // The maximum bit width required to represent all the values that can be
  2905. // demoted without loss of precision. It would be safe to truncate the roots
  2906. // of the expression to this width.
  2907. auto MaxBitWidth = 8u;
  2908. // We first check if all the bits of the roots are demanded. If they're not,
  2909. // we can truncate the roots to this narrower type.
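// Illustrative example (assumed): if the demanded bits of an i32 root are
// 0x0000FFFF, Mask.countLeadingZeros() is 16, so only the low 16 bits are
// demanded and MaxBitWidth becomes 16.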
  2910. for (auto *Root : TreeRoot) {
  2911. auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
  2912. MaxBitWidth = std::max<unsigned>(
  2913. Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
  2914. }
  2915. // If all the bits of the roots are demanded, we can try a little harder to
  2916. // compute a narrower type. This can happen, for example, if the roots are
  2917. // getelementptr indices. InstCombine promotes these indices to the pointer
  2918. // width. Thus, all their bits are technically demanded even though the
  2919. // address computation might be vectorized in a smaller type.
  2920. //
  2921. // We start by looking at each entry that can be demoted. We compute the
  2922. // maximum bit width required to store the scalar by using ValueTracking to
  2923. // compute the number of high-order bits we can truncate.
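// Illustrative example (assumed): for an i32 scalar known to lie in
// [-128, 127], ComputeNumSignBits returns 25, so NumTypeBits - NumSignBits
// is 7; combined with the 8-bit floor this yields MaxBitWidth = 8.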
  2924. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
  2925. MaxBitWidth = 8u;
  2926. for (auto *Scalar : ToDemote) {
  2927. auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
  2928. auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
  2929. MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
  2930. }
  2931. }
  2932. // Round MaxBitWidth up to the next power-of-two.
  2933. if (!isPowerOf2_64(MaxBitWidth))
  2934. MaxBitWidth = NextPowerOf2(MaxBitWidth);
2935. // If the maximum bit width we compute is less than the width of the roots'
  2936. // type, we can proceed with the narrowing. Otherwise, do nothing.
  2937. if (MaxBitWidth >= TreeRootIT->getBitWidth())
  2938. return;
  2939. // If we can truncate the root, we must collect additional values that might
  2940. // be demoted as a result. That is, those seeded by truncations we will
  2941. // modify.
  2942. while (!Roots.empty())
  2943. collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
2944. // Finally, map the values we can demote to the maximum bit width we computed.
  2945. for (auto *Scalar : ToDemote)
  2946. MinBWs[Scalar] = MaxBitWidth;
  2947. }
  2948. /// The SLPVectorizer Pass.
  2949. struct SLPVectorizer : public FunctionPass {
  2950. typedef SmallVector<StoreInst *, 8> StoreList;
  2951. typedef MapVector<Value *, StoreList> StoreListMap;
  2952. typedef SmallVector<WeakVH, 8> WeakVHList;
  2953. typedef MapVector<Value *, WeakVHList> WeakVHListMap;
  2954. /// Pass identification, replacement for typeid
  2955. static char ID;
  2956. explicit SLPVectorizer() : FunctionPass(ID) {
  2957. initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  2958. }
  2959. ScalarEvolution *SE;
  2960. TargetTransformInfo *TTI;
  2961. TargetLibraryInfo *TLI;
  2962. AliasAnalysis *AA;
  2963. LoopInfo *LI;
  2964. DominatorTree *DT;
  2965. AssumptionCache *AC;
  2966. DemandedBits *DB;
  2967. const DataLayout *DL;
  2968. bool doInitialization(Module &M) override {
  2969. DL = &M.getDataLayout();
  2970. return false;
  2971. }
  2972. bool runOnFunction(Function &F) override {
  2973. if (skipFunction(F))
  2974. return false;
  2975. SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  2976. TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  2977. auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  2978. TLI = TLIP ? &TLIP->getTLI() : nullptr;
  2979. AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  2980. LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  2981. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  2982. AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  2983. DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
  2984. Stores.clear();
  2985. GEPs.clear();
  2986. bool Changed = false;
  2987. // If the target claims to have no vector registers don't attempt
  2988. // vectorization.
  2989. if (!TTI->getNumberOfRegisters(true))
  2990. return false;
  2991. // Use the vector register size specified by the target unless overridden
  2992. // by a command-line option.
  2993. // TODO: It would be better to limit the vectorization factor based on
  2994. // data type rather than just register size. For example, x86 AVX has
  2995. // 256-bit registers, but it does not support integer operations
  2996. // at that width (that requires AVX2).
  2997. if (MaxVectorRegSizeOption.getNumOccurrences())
  2998. MaxVecRegSize = MaxVectorRegSizeOption;
  2999. else
  3000. MaxVecRegSize = TTI->getRegisterBitWidth(true);
  3001. MinVecRegSize = MinVectorRegSizeOption;
  3002. // Don't vectorize when the attribute NoImplicitFloat is used.
  3003. if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  3004. return false;
  3005. DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  3006. // Use the bottom up slp vectorizer to construct chains that start with
  3007. // store instructions.
  3008. BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
  3009. // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  3010. // delete instructions.
  3011. // Scan the blocks in the function in post order.
  3012. for (auto BB : post_order(&F.getEntryBlock())) {
  3013. collectSeedInstructions(BB);
  3014. // Vectorize trees that end at stores.
  3015. if (!Stores.empty()) {
  3016. DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
  3017. << " underlying objects.\n");
  3018. Changed |= vectorizeStoreChains(R);
  3019. }
  3020. // Vectorize trees that end at reductions.
  3021. Changed |= vectorizeChainsInBlock(BB, R);
  3022. // Vectorize the index computations of getelementptr instructions. This
  3023. // is primarily intended to catch gather-like idioms ending at
  3024. // non-consecutive loads.
  3025. if (!GEPs.empty()) {
  3026. DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
  3027. << " underlying objects.\n");
  3028. Changed |= vectorizeGEPIndices(BB, R);
  3029. }
  3030. }
  3031. if (Changed) {
  3032. R.optimizeGatherSequence();
  3033. DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  3034. DEBUG(verifyFunction(F));
  3035. }
  3036. return Changed;
  3037. }
  3038. void getAnalysisUsage(AnalysisUsage &AU) const override {
  3039. FunctionPass::getAnalysisUsage(AU);
  3040. AU.addRequired<AssumptionCacheTracker>();
  3041. AU.addRequired<ScalarEvolutionWrapperPass>();
  3042. AU.addRequired<AAResultsWrapperPass>();
  3043. AU.addRequired<TargetTransformInfoWrapperPass>();
  3044. AU.addRequired<LoopInfoWrapperPass>();
  3045. AU.addRequired<DominatorTreeWrapperPass>();
  3046. AU.addRequired<DemandedBitsWrapperPass>();
  3047. AU.addPreserved<LoopInfoWrapperPass>();
  3048. AU.addPreserved<DominatorTreeWrapperPass>();
  3049. AU.addPreserved<AAResultsWrapperPass>();
  3050. AU.addPreserved<GlobalsAAWrapperPass>();
  3051. AU.setPreservesCFG();
  3052. }
  3053. private:
  3054. /// \brief Collect store and getelementptr instructions and organize them
  3055. /// according to the underlying object of their pointer operands. We sort the
  3056. /// instructions by their underlying objects to reduce the cost of
  3057. /// consecutive access queries.
  3058. ///
  3059. /// TODO: We can further reduce this cost if we flush the chain creation
  3060. /// every time we run into a memory barrier.
  3061. void collectSeedInstructions(BasicBlock *BB);
  3062. /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
  3063. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
  3064. /// \brief Try to vectorize a list of operands.
3065. /// \param BuildVector A list of users to ignore for the purpose of
  3066. /// scheduling and that don't need extracting.
  3067. /// \returns true if a value was vectorized.
  3068. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  3069. ArrayRef<Value *> BuildVector = None,
  3070. bool allowReorder = false);
3071. /// \brief Try to vectorize a chain that may start at the operands of \p V.
  3072. bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
  3073. /// \brief Vectorize the store instructions collected in Stores.
  3074. bool vectorizeStoreChains(BoUpSLP &R);
  3075. /// \brief Vectorize the index computations of the getelementptr instructions
  3076. /// collected in GEPs.
  3077. bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
  3078. /// \brief Scan the basic block and look for patterns that are likely to start
  3079. /// a vectorization chain.
  3080. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
  3081. bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
  3082. BoUpSLP &R, unsigned VecRegSize);
  3083. bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
  3084. BoUpSLP &R);
  3085. /// The store instructions in a basic block organized by base pointer.
  3086. StoreListMap Stores;
  3087. /// The getelementptr instructions in a basic block organized by base pointer.
  3088. WeakVHListMap GEPs;
  3089. unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  3090. unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  3091. };
3092. /// \brief Check that the Values in the slice of the VL array still exist in
  3093. /// the WeakVH array.
  3094. /// Vectorization of part of the VL array may cause later values in the VL array
  3095. /// to become invalid. We track when this has happened in the WeakVH array.
  3096. static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
  3097. unsigned SliceBegin, unsigned SliceSize) {
  3098. VL = VL.slice(SliceBegin, SliceSize);
  3099. VH = VH.slice(SliceBegin, SliceSize);
  3100. return !std::equal(VL.begin(), VL.end(), VH.begin());
  3101. }
  3102. bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
  3103. int CostThreshold, BoUpSLP &R,
  3104. unsigned VecRegSize) {
  3105. unsigned ChainLen = Chain.size();
  3106. DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
  3107. << "\n");
  3108. unsigned Sz = R.getVectorElementSize(Chain[0]);
  3109. unsigned VF = VecRegSize / Sz;
  3110. if (!isPowerOf2_32(Sz) || VF < 2)
  3111. return false;
  3112. // Keep track of values that were deleted by vectorizing in the loop below.
  3113. SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
  3114. bool Changed = false;
  3115. // Look for profitable vectorizable trees at all offsets, starting at zero.
  3116. for (unsigned i = 0, e = ChainLen; i < e; ++i) {
  3117. if (i + VF > e)
  3118. break;
  3119. // Check that a previous iteration of this loop did not delete the Value.
  3120. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
  3121. continue;
  3122. DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
  3123. << "\n");
  3124. ArrayRef<Value *> Operands = Chain.slice(i, VF);
  3125. R.buildTree(Operands);
  3126. R.computeMinimumValueSizes();
  3127. int Cost = R.getTreeCost();
  3128. DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
  3129. if (Cost < CostThreshold) {
  3130. DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
  3131. R.vectorizeTree();
  3132. // Move to the next bundle.
  3133. i += VF - 1;
  3134. Changed = true;
  3135. }
  3136. }
  3137. return Changed;
  3138. }
  3139. bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
  3140. int costThreshold, BoUpSLP &R) {
  3141. SetVector<StoreInst *> Heads, Tails;
  3142. SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
  3143. // We may run into multiple chains that merge into a single chain. We mark the
  3144. // stores that we vectorized so that we don't visit the same store twice.
  3145. BoUpSLP::ValueSet VectorizedStores;
  3146. bool Changed = false;
  3147. // Do a quadratic search on all of the given stores and find
  3148. // all of the pairs of stores that follow each other.
  3149. SmallVector<unsigned, 16> IndexQueue;
  3150. for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
  3151. IndexQueue.clear();
3152. // If a store has multiple consecutive store candidates, search the Stores
3153. // array according to the sequence: from i+1 to e, then from i-1 to 0.
3154. // This is because pairing with the immediately succeeding or preceding
3155. // candidate usually creates the best chance to find an SLP vectorization opportunity.
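// Illustrative example (assumed): with i == 2 and e == 5, the candidate
// stores are visited in the order 3, 4, 1, 0.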
  3156. unsigned j = 0;
  3157. for (j = i + 1; j < e; ++j)
  3158. IndexQueue.push_back(j);
  3159. for (j = i; j > 0; --j)
  3160. IndexQueue.push_back(j - 1);
  3161. for (auto &k : IndexQueue) {
  3162. if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
  3163. Tails.insert(Stores[k]);
  3164. Heads.insert(Stores[i]);
  3165. ConsecutiveChain[Stores[i]] = Stores[k];
  3166. break;
  3167. }
  3168. }
  3169. }
  3170. // For stores that start but don't end a link in the chain:
  3171. for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
  3172. it != e; ++it) {
  3173. if (Tails.count(*it))
  3174. continue;
  3175. // We found a store instr that starts a chain. Now follow the chain and try
  3176. // to vectorize it.
  3177. BoUpSLP::ValueList Operands;
  3178. StoreInst *I = *it;
  3179. // Collect the chain into a list.
  3180. while (Tails.count(I) || Heads.count(I)) {
  3181. if (VectorizedStores.count(I))
  3182. break;
  3183. Operands.push_back(I);
  3184. // Move to the next value in the chain.
  3185. I = ConsecutiveChain[I];
  3186. }
  3187. // FIXME: Is division-by-2 the correct step? Should we assert that the
  3188. // register size is a power-of-2?
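// Illustrative example (assumed): with MaxVecRegSize == 256 and
// MinVecRegSize == 128, the chain is tried at 256 bits first and, if that
// does not vectorize, again at 128 bits.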
  3189. for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
  3190. if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
  3191. // Mark the vectorized stores so that we don't vectorize them again.
  3192. VectorizedStores.insert(Operands.begin(), Operands.end());
  3193. Changed = true;
  3194. break;
  3195. }
  3196. }
  3197. }
  3198. return Changed;
  3199. }
  3200. void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
  3201. // Initialize the collections. We will make a single pass over the block.
  3202. Stores.clear();
  3203. GEPs.clear();
  3204. // Visit the store and getelementptr instructions in BB and organize them in
  3205. // Stores and GEPs according to the underlying objects of their pointer
  3206. // operands.
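// Illustrative example (assumed): two stores whose pointer operands are %p
// and "getelementptr i32, i32* %p, i64 1" share the underlying object %p, so
// they land in the same Stores bucket and can later be checked for
// consecutive access.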
  3207. for (Instruction &I : *BB) {
  3208. // Ignore store instructions that are volatile or have a pointer operand
  3209. // that doesn't point to a scalar type.
  3210. if (auto *SI = dyn_cast<StoreInst>(&I)) {
  3211. if (!SI->isSimple())
  3212. continue;
  3213. if (!isValidElementType(SI->getValueOperand()->getType()))
  3214. continue;
  3215. Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
  3216. }
  3217. // Ignore getelementptr instructions that have more than one index, a
  3218. // constant index, or a pointer operand that doesn't point to a scalar
  3219. // type.
  3220. else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
  3221. auto Idx = GEP->idx_begin()->get();
  3222. if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
  3223. continue;
  3224. if (!isValidElementType(Idx->getType()))
  3225. continue;
  3226. GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
  3227. }
  3228. }
  3229. }
  3230. bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  3231. if (!A || !B)
  3232. return false;
  3233. Value *VL[] = { A, B };
  3234. return tryToVectorizeList(VL, R, None, true);
  3235. }
  3236. bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  3237. ArrayRef<Value *> BuildVector,
  3238. bool allowReorder) {
  3239. if (VL.size() < 2)
  3240. return false;
  3241. DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
  3242. // Check that all of the parts are scalar instructions of the same type.
  3243. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  3244. if (!I0)
  3245. return false;
  3246. unsigned Opcode0 = I0->getOpcode();
  3247. // FIXME: Register size should be a parameter to this function, so we can
  3248. // try different vectorization factors.
  3249. unsigned Sz = R.getVectorElementSize(I0);
  3250. unsigned VF = MinVecRegSize / Sz;
  3251. for (Value *V : VL) {
  3252. Type *Ty = V->getType();
  3253. if (!isValidElementType(Ty))
  3254. return false;
  3255. Instruction *Inst = dyn_cast<Instruction>(V);
  3256. if (!Inst || Inst->getOpcode() != Opcode0)
  3257. return false;
  3258. }
  3259. bool Changed = false;
  3260. // Keep track of values that were deleted by vectorizing in the loop below.
  3261. SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
  3262. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  3263. unsigned OpsWidth = 0;
  3264. if (i + VF > e)
  3265. OpsWidth = e - i;
  3266. else
  3267. OpsWidth = VF;
  3268. if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
  3269. break;
  3270. // Check that a previous iteration of this loop did not delete the Value.
  3271. if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
  3272. continue;
  3273. DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
  3274. << "\n");
  3275. ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
  3276. ArrayRef<Value *> BuildVectorSlice;
  3277. if (!BuildVector.empty())
  3278. BuildVectorSlice = BuildVector.slice(i, OpsWidth);
  3279. R.buildTree(Ops, BuildVectorSlice);
  3280. // TODO: check if we can allow reordering also for other cases than
  3281. // tryToVectorizePair()
  3282. if (allowReorder && R.shouldReorder()) {
  3283. assert(Ops.size() == 2);
  3284. assert(BuildVectorSlice.empty());
  3285. Value *ReorderedOps[] = { Ops[1], Ops[0] };
  3286. R.buildTree(ReorderedOps, None);
  3287. }
  3288. R.computeMinimumValueSizes();
  3289. int Cost = R.getTreeCost();
  3290. if (Cost < -SLPCostThreshold) {
  3291. DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
  3292. Value *VectorizedRoot = R.vectorizeTree();
  3293. // Reconstruct the build vector by extracting the vectorized root. This
  3294. // way we handle the case where some elements of the vector are undefined.
3295. // (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
  3296. if (!BuildVectorSlice.empty()) {
  3297. // The insert point is the last build vector instruction. The vectorized
  3298. // root will precede it. This guarantees that we get an instruction. The
  3299. // vectorized tree could have been constant folded.
  3300. Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
  3301. unsigned VecIdx = 0;
  3302. for (auto &V : BuildVectorSlice) {
  3303. IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
  3304. ++BasicBlock::iterator(InsertAfter));
  3305. InsertElementInst *IE = cast<InsertElementInst>(V);
  3306. Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
  3307. VectorizedRoot, Builder.getInt32(VecIdx++)));
  3308. IE->setOperand(1, Extract);
  3309. IE->removeFromParent();
  3310. IE->insertAfter(Extract);
  3311. InsertAfter = IE;
  3312. }
  3313. }
  3314. // Move to the next bundle.
  3315. i += VF - 1;
  3316. Changed = true;
  3317. }
  3318. }
  3319. return Changed;
  3320. }
  3321. bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  3322. if (!V)
  3323. return false;
  3324. // Try to vectorize V.
  3325. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
  3326. return true;
  3327. BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  3328. BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
  3329. // Try to skip B.
  3330. if (B && B->hasOneUse()) {
  3331. BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
  3332. BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
  3333. if (tryToVectorizePair(A, B0, R)) {
  3334. return true;
  3335. }
  3336. if (tryToVectorizePair(A, B1, R)) {
  3337. return true;
  3338. }
  3339. }
  3340. // Try to skip A.
  3341. if (A && A->hasOneUse()) {
  3342. BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
  3343. BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
  3344. if (tryToVectorizePair(A0, B, R)) {
  3345. return true;
  3346. }
  3347. if (tryToVectorizePair(A1, B, R)) {
  3348. return true;
  3349. }
  3350. }
3351. return false;
  3352. }
  3353. /// \brief Generate a shuffle mask to be used in a reduction tree.
  3354. ///
  3355. /// \param VecLen The length of the vector to be reduced.
  3356. /// \param NumEltsToRdx The number of elements that should be reduced in the
  3357. /// vector.
  3358. /// \param IsPairwise Whether the reduction is a pairwise or splitting
  3359. /// reduction. A pairwise reduction will generate a mask of
  3360. /// <0,2,...> or <1,3,..> while a splitting reduction will generate
  3361. /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
  3362. /// \param IsLeft True will generate a mask of even elements, odd otherwise.
  3363. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
  3364. bool IsPairwise, bool IsLeft,
  3365. IRBuilder<> &Builder) {
  3366. assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
  3367. SmallVector<Constant *, 32> ShuffleMask(
  3368. VecLen, UndefValue::get(Builder.getInt32Ty()));
  3369. if (IsPairwise)
  3370. // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
  3371. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3372. ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  3373. else
  3374. // Move the upper half of the vector to the lower half.
  3375. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3376. ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
  3377. return ConstantVector::get(ShuffleMask);
  3378. }
  3379. /// Model horizontal reductions.
  3380. ///
  3381. /// A horizontal reduction is a tree of reduction operations (currently add and
  3382. /// fadd) that has operations that can be put into a vector as its leaf.
  3383. /// For example, this tree:
  3384. ///
  3385. /// mul mul mul mul
  3386. /// \ / \ /
  3387. /// + +
  3388. /// \ /
  3389. /// +
  3390. /// This tree has "mul" as its reduced values and "+" as its reduction
  3391. /// operations. A reduction might be feeding into a store or a binary operation
  3392. /// feeding a phi.
  3393. /// ...
  3394. /// \ /
  3395. /// +
  3396. /// |
  3397. /// phi +=
  3398. ///
  3399. /// Or:
  3400. /// ...
  3401. /// \ /
  3402. /// +
  3403. /// |
  3404. /// *p =
  3405. ///
  3406. class HorizontalReduction {
  3407. SmallVector<Value *, 16> ReductionOps;
  3408. SmallVector<Value *, 32> ReducedVals;
  3409. BinaryOperator *ReductionRoot;
  3410. PHINode *ReductionPHI;
  3411. /// The opcode of the reduction.
  3412. unsigned ReductionOpcode;
  3413. /// The opcode of the values we perform a reduction on.
  3414. unsigned ReducedValueOpcode;
  3415. /// Should we model this reduction as a pairwise reduction tree or a tree that
  3416. /// splits the vector in halves and adds those halves.
  3417. bool IsPairwiseReduction;
  3418. public:
  3419. /// The width of one full horizontal reduction operation.
  3420. unsigned ReduxWidth;
  3421. /// Minimal width of available vector registers. It's used to determine
  3422. /// ReduxWidth.
  3423. unsigned MinVecRegSize;
  3424. HorizontalReduction(unsigned MinVecRegSize)
  3425. : ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
  3426. ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
  3427. MinVecRegSize(MinVecRegSize) {}
  3428. /// \brief Try to find a reduction tree.
  3429. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
  3430. assert((!Phi ||
  3431. std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
  3432. "Thi phi needs to use the binary operator");
3433. // We could have an initial reduction that is not an add.
  3434. // r *= v1 + v2 + v3 + v4
  3435. // In such a case start looking for a tree rooted in the first '+'.
  3436. if (Phi) {
  3437. if (B->getOperand(0) == Phi) {
  3438. Phi = nullptr;
  3439. B = dyn_cast<BinaryOperator>(B->getOperand(1));
  3440. } else if (B->getOperand(1) == Phi) {
  3441. Phi = nullptr;
  3442. B = dyn_cast<BinaryOperator>(B->getOperand(0));
  3443. }
  3444. }
  3445. if (!B)
  3446. return false;
  3447. Type *Ty = B->getType();
  3448. if (!isValidElementType(Ty))
  3449. return false;
  3450. const DataLayout &DL = B->getModule()->getDataLayout();
  3451. ReductionOpcode = B->getOpcode();
  3452. ReducedValueOpcode = 0;
  3453. // FIXME: Register size should be a parameter to this function, so we can
  3454. // try different vectorization factors.
  3455. ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
  3456. ReductionRoot = B;
  3457. ReductionPHI = Phi;
  3458. if (ReduxWidth < 4)
  3459. return false;
  3460. // We currently only support adds.
  3461. if (ReductionOpcode != Instruction::Add &&
  3462. ReductionOpcode != Instruction::FAdd)
  3463. return false;
3464. // Post-order traverse the reduction tree starting at B. We only handle true
  3465. // trees containing only binary operators or selects.
  3466. SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
  3467. Stack.push_back(std::make_pair(B, 0));
  3468. while (!Stack.empty()) {
  3469. Instruction *TreeN = Stack.back().first;
  3470. unsigned EdgeToVist = Stack.back().second++;
  3471. bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
  3472. // Only handle trees in the current basic block.
  3473. if (TreeN->getParent() != B->getParent())
  3474. return false;
  3475. // Each tree node needs to have one user except for the ultimate
  3476. // reduction.
  3477. if (!TreeN->hasOneUse() && TreeN != B)
  3478. return false;
3479. // Postorder visit.
  3480. if (EdgeToVist == 2 || IsReducedValue) {
  3481. if (IsReducedValue) {
  3482. // Make sure that the opcodes of the operations that we are going to
  3483. // reduce match.
  3484. if (!ReducedValueOpcode)
  3485. ReducedValueOpcode = TreeN->getOpcode();
  3486. else if (ReducedValueOpcode != TreeN->getOpcode())
  3487. return false;
  3488. ReducedVals.push_back(TreeN);
  3489. } else {
  3490. // We need to be able to reassociate the adds.
  3491. if (!TreeN->isAssociative())
  3492. return false;
  3493. ReductionOps.push_back(TreeN);
  3494. }
  3495. // Retract.
  3496. Stack.pop_back();
  3497. continue;
  3498. }
  3499. // Visit left or right.
  3500. Value *NextV = TreeN->getOperand(EdgeToVist);
  3501. // We currently only allow BinaryOperator's and SelectInst's as reduction
  3502. // values in our tree.
  3503. if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
  3504. Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
  3505. else if (NextV != Phi)
  3506. return false;
  3507. }
  3508. return true;
  3509. }
  /// \brief Attempt to vectorize the tree found by
  /// matchAssociativeReduction.
  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
    if (ReducedVals.empty())
      return false;

    unsigned NumReducedVals = ReducedVals.size();
    if (NumReducedVals < ReduxWidth)
      return false;

    Value *VectorizedTree = nullptr;
    IRBuilder<> Builder(ReductionRoot);
    FastMathFlags Unsafe;
    Unsafe.setUnsafeAlgebra();
    Builder.setFastMathFlags(Unsafe);
    unsigned i = 0;

    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
      V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
      V.computeMinimumValueSizes();

      // Estimate cost.
      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
      if (Cost >= -SLPCostThreshold)
        break;

      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
                   << ". (HorRdx)\n");

      // Vectorize a tree.
      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
      Value *VectorizedRoot = V.vectorizeTree();

      // Emit a reduction.
      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
      if (VectorizedTree) {
        Builder.SetCurrentDebugLocation(Loc);
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedSubTree, "bin.rdx");
      } else
        VectorizedTree = ReducedSubTree;
    }

    if (VectorizedTree) {
      // Finish the reduction.
      for (; i < NumReducedVals; ++i) {
        Builder.SetCurrentDebugLocation(
            cast<Instruction>(ReducedVals[i])->getDebugLoc());
        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
                                     ReducedVals[i]);
      }
      // Update users.
      if (ReductionPHI) {
        assert(ReductionRoot && "Need a reduction operation");
        ReductionRoot->setOperand(0, VectorizedTree);
        ReductionRoot->setOperand(1, ReductionPHI);
      } else
        ReductionRoot->replaceAllUsesWith(VectorizedTree);
    }
    return VectorizedTree != nullptr;
  }

  unsigned numReductionValues() const {
    return ReducedVals.size();
  }

private:
  /// \brief Calculate the cost of a reduction.
  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
    Type *ScalarTy = FirstReducedVal->getType();
    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);

    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);

    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;

    int ScalarReduxCost =
        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
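    // For example, with ReduxWidth == 4 the scalar estimate is four times the
    // cost of one reduction-opcode instruction, which is compared against the
    // cheaper of the target's pairwise and splitting reduction costs.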
    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
                 << " for reduction that starts with " << *FirstReducedVal
                 << " (It is a "
                 << (IsPairwiseReduction ? "pairwise" : "splitting")
                 << " reduction)\n");

    return VecReduxCost - ScalarReduxCost;
  }

  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
                            Value *R, const Twine &Name = "") {
    if (Opcode == Instruction::FAdd)
      return Builder.CreateFAdd(L, R, Name);
    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  }

  /// \brief Emit a horizontal reduction of the vectorized value.
  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
    assert(VectorizedValue && "Need to have a vectorized tree node");
    assert(isPowerOf2_32(ReduxWidth) &&
           "We only handle power-of-two reductions for now");
    Value *TmpVec = VectorizedValue;
    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
      if (IsPairwiseReduction) {
        Value *LeftMask =
            createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
        Value *RightMask =
            createRdxShuffleMask(ReduxWidth, i, true, false, Builder);

        Value *LeftShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
        Value *RightShuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
            "rdx.shuf.r");
        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
                             "bin.rdx");
      } else {
        Value *UpperHalf =
            createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
        Value *Shuf = Builder.CreateShuffleVector(
            TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
      }
    }

    // The result is in the first element of the vector.
    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  }
};
/// \brief Recognize construction of vectors like
///  %ra = insertelement <4 x float> undef, float %s0, i32 0
///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
///
/// Returns true if it matches.
///
static bool findBuildVector(InsertElementInst *FirstInsertElem,
                            SmallVectorImpl<Value *> &BuildVector,
                            SmallVectorImpl<Value *> &BuildVectorOpds) {
  if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
    return false;

  InsertElementInst *IE = FirstInsertElem;
  while (true) {
    BuildVector.push_back(IE);
    BuildVectorOpds.push_back(IE->getOperand(1));

    if (IE->use_empty())
      return false;

    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
    if (!NextUse)
      return true;

    // If this isn't the final use, make sure the next insertelement is the
    // only use. It's OK if the final constructed vector is used multiple
    // times.
    if (!IE->hasOneUse())
      return false;

    IE = NextUse;
  }

  return false;
}
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  return V->getType() < V2->getType();
}

/// \brief Try and get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
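///
/// For instance, in a loop of the form (names are illustrative only)
///   loop:
///     %sum = phi float [ 0.0, %entry ], [ %acc, %loop ]
///     ...
///     %acc = fadd fast float %sum, %x
///     br i1 %cond, label %loop, label %exit
/// the candidate reduction value for %sum is %acc.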
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return (
        dyn_cast<Instruction>(R) &&
        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch, return the incoming value if it comes from
  // that. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}
/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \p P
/// with the reduction operator \p BI, then check whether it can be done.
/// \returns true if a horizontal reduction was matched and reduced.
/// \returns false if a horizontal reduction was not matched.
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI,
                                        unsigned MinRegSize) {
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx(MinRegSize);
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce
  // to a nearby power-of-2. We can safely generate oversized
  // vectors and rely on the backend to split them to legal sizes.
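  // For example, 24 matched reduction values give PowerOf2Floor(24) == 16, so
  // the reduction is attempted with a width of 16 even if that exceeds the
  // minimum vector register size.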
  HorRdx.ReduxWidth =
      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}
bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success: start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }
  VisitedInstrs.clear();

  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times, so skip the instructions we have
    // already checked.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
                                          MinVecRegSize) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }
    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }
    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}
bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << Entry.second.size() << ".\n");

    // We process the getelementptr list in chunks of 16 (like we do for
    // stores) to minimize compile-time.
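    // For example, a list of 40 getelementptrs is analyzed as chunks of 16,
    // 16, and 8 candidates.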
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, the WeakVHs will have nullified the
      // values, so remove them from the set of candidates.
      Candidates.remove(nullptr);

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
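      // For instance (with illustrative names), %g1 = getelementptr i32,
      // i32* %base, i64 %i and %g2 = getelementptr i32, i32* %base, i64 %j
      // with %j == %i + 1 differ by a constant according to SCEV, so both
      // are dropped from the candidate set.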
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPList[I]);
            Candidates.remove(GEPList[J]);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPList[J]);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try and vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}
bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
       ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    // TODO: The limit of 16 inhibits greater vectorization factors.
    //       For example, AVX2 supports v32i8. Increasing this limit, however,
    //       may cause a significant compile-time increase.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}
} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}