  1. //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  11. // stores that can be put together into vector-stores. Next, it attempts to
  12. // construct a vectorizable tree using the use-def chains. If a profitable tree
  13. // is found, the SLP vectorizer performs vectorization on the tree.
  14. //
  15. // The pass is inspired by the work described in the paper:
  16. // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  17. //
  18. //===----------------------------------------------------------------------===//
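// Illustrative sketch (hypothetical example; the arrays a, b, c are not part
// of this pass): the scalar pattern targeted by the vectorizer looks like
//   a[0] = b[0] + c[0];  a[1] = b[1] + c[1];
//   a[2] = b[2] + c[2];  a[3] = b[3] + c[3];
// and, when the cost model reports a net gain, it is rewritten into one wide
// add feeding a single vector store.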
  19. #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
  20. #include "llvm/ADT/ArrayRef.h"
  21. #include "llvm/ADT/DenseMap.h"
  22. #include "llvm/ADT/DenseSet.h"
  23. #include "llvm/ADT/MapVector.h"
  24. #include "llvm/ADT/None.h"
  25. #include "llvm/ADT/Optional.h"
  26. #include "llvm/ADT/PostOrderIterator.h"
  27. #include "llvm/ADT/STLExtras.h"
  28. #include "llvm/ADT/SetVector.h"
  29. #include "llvm/ADT/SmallPtrSet.h"
  30. #include "llvm/ADT/SmallSet.h"
  31. #include "llvm/ADT/SmallVector.h"
  32. #include "llvm/ADT/Statistic.h"
  33. #include "llvm/ADT/iterator.h"
  34. #include "llvm/ADT/iterator_range.h"
  35. #include "llvm/Analysis/AliasAnalysis.h"
  36. #include "llvm/Analysis/CodeMetrics.h"
  37. #include "llvm/Analysis/DemandedBits.h"
  38. #include "llvm/Analysis/GlobalsModRef.h"
  39. #include "llvm/Analysis/LoopAccessAnalysis.h"
  40. #include "llvm/Analysis/LoopInfo.h"
  41. #include "llvm/Analysis/MemoryLocation.h"
  42. #include "llvm/Analysis/OptimizationRemarkEmitter.h"
  43. #include "llvm/Analysis/ScalarEvolution.h"
  44. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  45. #include "llvm/Analysis/TargetLibraryInfo.h"
  46. #include "llvm/Analysis/TargetTransformInfo.h"
  47. #include "llvm/Analysis/ValueTracking.h"
  48. #include "llvm/Analysis/VectorUtils.h"
  49. #include "llvm/IR/Attributes.h"
  50. #include "llvm/IR/BasicBlock.h"
  51. #include "llvm/IR/Constant.h"
  52. #include "llvm/IR/Constants.h"
  53. #include "llvm/IR/DataLayout.h"
  54. #include "llvm/IR/DebugLoc.h"
  55. #include "llvm/IR/DerivedTypes.h"
  56. #include "llvm/IR/Dominators.h"
  57. #include "llvm/IR/Function.h"
  58. #include "llvm/IR/IRBuilder.h"
  59. #include "llvm/IR/InstrTypes.h"
  60. #include "llvm/IR/Instruction.h"
  61. #include "llvm/IR/Instructions.h"
  62. #include "llvm/IR/IntrinsicInst.h"
  63. #include "llvm/IR/Intrinsics.h"
  64. #include "llvm/IR/Module.h"
  65. #include "llvm/IR/NoFolder.h"
  66. #include "llvm/IR/Operator.h"
  67. #include "llvm/IR/PassManager.h"
  68. #include "llvm/IR/PatternMatch.h"
  69. #include "llvm/IR/Type.h"
  70. #include "llvm/IR/Use.h"
  71. #include "llvm/IR/User.h"
  72. #include "llvm/IR/Value.h"
  73. #include "llvm/IR/ValueHandle.h"
  74. #include "llvm/IR/Verifier.h"
  75. #include "llvm/Pass.h"
  76. #include "llvm/Support/Casting.h"
  77. #include "llvm/Support/CommandLine.h"
  78. #include "llvm/Support/Compiler.h"
  79. #include "llvm/Support/DOTGraphTraits.h"
  80. #include "llvm/Support/Debug.h"
  81. #include "llvm/Support/ErrorHandling.h"
  82. #include "llvm/Support/GraphWriter.h"
  83. #include "llvm/Support/KnownBits.h"
  84. #include "llvm/Support/MathExtras.h"
  85. #include "llvm/Support/raw_ostream.h"
  86. #include "llvm/Transforms/Utils/LoopUtils.h"
  87. #include "llvm/Transforms/Vectorize.h"
  88. #include <algorithm>
  89. #include <cassert>
  90. #include <cstdint>
  91. #include <iterator>
  92. #include <memory>
  93. #include <set>
  94. #include <string>
  95. #include <tuple>
  96. #include <utility>
  97. #include <vector>
  98. using namespace llvm;
  99. using namespace llvm::PatternMatch;
  100. using namespace slpvectorizer;
  101. #define SV_NAME "slp-vectorizer"
  102. #define DEBUG_TYPE "SLP"
  103. STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
  104. static cl::opt<int>
  105. SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  106. cl::desc("Only vectorize if you gain more than this "
  107. "number "));
  108. static cl::opt<bool>
  109. ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
  110. cl::desc("Attempt to vectorize horizontal reductions"));
  111. static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  112. "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  113. cl::desc(
  114. "Attempt to vectorize horizontal reductions feeding into a store"));
  115. static cl::opt<int>
  116. MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
  117. cl::desc("Attempt to vectorize for this register size in bits"));
  118. /// Limits the size of scheduling regions in a block.
  120. /// It avoids long compile times for _very_ large blocks where vector
  120. /// instructions are spread over a wide range.
  121. /// This limit is way higher than needed by real-world functions.
  122. static cl::opt<int>
  123. ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
  124. cl::desc("Limit the size of the SLP scheduling region per block"));
  125. static cl::opt<int> MinVectorRegSizeOption(
  126. "slp-min-reg-size", cl::init(128), cl::Hidden,
  127. cl::desc("Attempt to vectorize for this register size in bits"));
  128. static cl::opt<unsigned> RecursionMaxDepth(
  129. "slp-recursion-max-depth", cl::init(12), cl::Hidden,
  130. cl::desc("Limit the recursion depth when building a vectorizable tree"));
  131. static cl::opt<unsigned> MinTreeSize(
  132. "slp-min-tree-size", cl::init(3), cl::Hidden,
  133. cl::desc("Only vectorize small trees if they are fully vectorizable"));
  134. static cl::opt<bool>
  135. ViewSLPTree("view-slp-tree", cl::Hidden,
  136. cl::desc("Display the SLP trees with Graphviz"));
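// Example invocation (hypothetical, for illustration only): the options above
// are regular cl::opt flags, so they can be combined on the opt command line,
// e.g.
//   opt -slp-vectorizer -slp-threshold=-10 -slp-max-reg-size=256 -S in.ll
// where a negative threshold admits trees the default cost model would reject.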
  137. // Limit the number of alias checks. The limit is chosen so that
  138. // it has no negative effect on the llvm benchmarks.
  139. static const unsigned AliasedCheckLimit = 10;
  140. // Another limit for the alias checks: The maximum distance between load/store
  141. // instructions where alias checks are done.
  142. // This limit is useful for very large basic blocks.
  143. static const unsigned MaxMemDepDistance = 160;
  144. /// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
  145. /// regions to be handled.
  146. static const int MinScheduleRegionSize = 16;
  147. /// \brief Predicate for the element types that the SLP vectorizer supports.
  148. ///
  149. /// The most important thing to filter here are types which are invalid in LLVM
  150. /// vectors. We also filter target specific types which have absolutely no
  151. /// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
  152. /// avoids spending time checking the cost model and realizing that they will
  153. /// be inevitably scalarized.
  154. static bool isValidElementType(Type *Ty) {
  155. return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
  156. !Ty->isPPC_FP128Ty();
  157. }
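// For illustration: i32 and float satisfy this predicate, while x86_fp80 and
// ppc_fp128 are rejected because, as noted above, no target provides a
// profitable vector lowering for them.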
  158. /// \returns true if all of the instructions in \p VL are in the same block or
  159. /// false otherwise.
  160. static bool allSameBlock(ArrayRef<Value *> VL) {
  161. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  162. if (!I0)
  163. return false;
  164. BasicBlock *BB = I0->getParent();
  165. for (int i = 1, e = VL.size(); i < e; i++) {
  166. Instruction *I = dyn_cast<Instruction>(VL[i]);
  167. if (!I)
  168. return false;
  169. if (BB != I->getParent())
  170. return false;
  171. }
  172. return true;
  173. }
  174. /// \returns True if all of the values in \p VL are constants.
  175. static bool allConstant(ArrayRef<Value *> VL) {
  176. for (Value *i : VL)
  177. if (!isa<Constant>(i))
  178. return false;
  179. return true;
  180. }
  181. /// \returns True if all of the values in \p VL are identical.
  182. static bool isSplat(ArrayRef<Value *> VL) {
  183. for (unsigned i = 1, e = VL.size(); i < e; ++i)
  184. if (VL[i] != VL[0])
  185. return false;
  186. return true;
  187. }
  188. /// Checks if the vector of instructions can be represented as a shuffle, like:
  189. /// %x0 = extractelement <4 x i8> %x, i32 0
  190. /// %x3 = extractelement <4 x i8> %x, i32 3
  191. /// %y1 = extractelement <4 x i8> %y, i32 1
  192. /// %y2 = extractelement <4 x i8> %y, i32 2
  193. /// %x0x0 = mul i8 %x0, %x0
  194. /// %x3x3 = mul i8 %x3, %x3
  195. /// %y1y1 = mul i8 %y1, %y1
  196. /// %y2y2 = mul i8 %y2, %y2
  197. /// %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
  198. /// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
  199. /// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
  200. /// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
  201. /// ret <4 x i8> %ins4
  202. /// can be transformed into:
  203. /// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
  204. /// i32 6>
  205. /// %2 = mul <4 x i8> %1, %1
  206. /// ret <4 x i8> %2
  207. /// We convert this initially to something like:
  208. /// %x0 = extractelement <4 x i8> %x, i32 0
  209. /// %x3 = extractelement <4 x i8> %x, i32 3
  210. /// %y1 = extractelement <4 x i8> %y, i32 1
  211. /// %y2 = extractelement <4 x i8> %y, i32 2
  212. /// %1 = insertelement <4 x i8> undef, i8 %x0, i32 0
  213. /// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
  214. /// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
  215. /// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
  216. /// %5 = mul <4 x i8> %4, %4
  217. /// %6 = extractelement <4 x i8> %5, i32 0
  218. /// %ins1 = insertelement <4 x i8> undef, i8 %6, i32 0
  219. /// %7 = extractelement <4 x i8> %5, i32 1
  220. /// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
  221. /// %8 = extractelement <4 x i8> %5, i32 2
  222. /// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
  223. /// %9 = extractelement <4 x i8> %5, i32 3
  224. /// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
  225. /// ret <4 x i8> %ins4
  226. /// InstCombiner transforms this into a shuffle and vector mul
  227. static Optional<TargetTransformInfo::ShuffleKind>
  228. isShuffle(ArrayRef<Value *> VL) {
  229. auto *EI0 = cast<ExtractElementInst>(VL[0]);
  230. unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
  231. Value *Vec1 = nullptr;
  232. Value *Vec2 = nullptr;
  233. enum ShuffleMode {Unknown, FirstAlternate, SecondAlternate, Permute};
  234. ShuffleMode CommonShuffleMode = Unknown;
  235. for (unsigned I = 0, E = VL.size(); I < E; ++I) {
  236. auto *EI = cast<ExtractElementInst>(VL[I]);
  237. auto *Vec = EI->getVectorOperand();
  238. // All vector operands must have the same number of vector elements.
  239. if (Vec->getType()->getVectorNumElements() != Size)
  240. return None;
  241. auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
  242. if (!Idx)
  243. return None;
  244. // Undefined behavior if Idx is negative or >= Size.
  245. if (Idx->getValue().uge(Size))
  246. continue;
  247. unsigned IntIdx = Idx->getValue().getZExtValue();
  248. // We can extractelement from undef vector.
  249. if (isa<UndefValue>(Vec))
  250. continue;
  251. // For correct shuffling we have to have at most 2 different vector operands
  252. // in all extractelement instructions.
  253. if (Vec1 && Vec2 && Vec != Vec1 && Vec != Vec2)
  254. return None;
  255. if (CommonShuffleMode == Permute)
  256. continue;
  257. // If the extract index is not the same as the operation number, it is a
  258. // permutation.
  259. if (IntIdx != I) {
  260. CommonShuffleMode = Permute;
  261. continue;
  262. }
  263. // Check the shuffle mode for the current operation.
  264. if (!Vec1)
  265. Vec1 = Vec;
  266. else if (Vec != Vec1)
  267. Vec2 = Vec;
  268. // Example: shufflevector A, B, <0,5,2,7>
  269. // I is even and IntIdx for A == I - FirstAlternate shuffle.
  270. // I is odd and IntIdx for B == I - FirstAlternate shuffle.
  271. // Example: shufflevector A, B, <4,1,6,3>
  272. // I is odd and IntIdx for A == I - SecondAlternate shuffle.
  273. // I is even and IntIdx for B == I - SecondAlternate shuffle.
  274. const bool IIsOdd = I & 1;
  275. const bool CurrVecIsA = Vec == Vec1;
  276. const bool IIsEven = !IIsOdd;
  277. const bool CurrVecIsB = !CurrVecIsA;
  278. ShuffleMode CurrentShuffleMode =
  279. ((IIsEven && CurrVecIsA) || (IIsOdd && CurrVecIsB)) ? FirstAlternate
  280. : SecondAlternate;
  281. // Common mode is not set or the same as the shuffle mode of the current
  282. // operation - alternate.
  283. if (CommonShuffleMode == Unknown)
  284. CommonShuffleMode = CurrentShuffleMode;
  285. // Common shuffle mode is not the same as the shuffle mode of the current
  286. // operation - permutation.
  287. if (CommonShuffleMode != CurrentShuffleMode)
  288. CommonShuffleMode = Permute;
  289. }
  290. // If we're not crossing lanes in different vectors, consider it as blending.
  291. if ((CommonShuffleMode == FirstAlternate ||
  292. CommonShuffleMode == SecondAlternate) &&
  293. Vec2)
  294. return TargetTransformInfo::SK_Alternate;
  295. // If Vec2 was never used, we have a permutation of a single vector, otherwise
  296. // we have a permutation of 2 vectors.
  297. return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
  298. : TargetTransformInfo::SK_PermuteSingleSrc;
  299. }
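// For illustration (hypothetical extract sequences): extracts of
// <A[0], B[1], A[2], B[3]> classify as SK_Alternate, extracts of
// <A[1], A[0], A[3], A[2]> as SK_PermuteSingleSrc, and extracts drawn from two
// vectors in a non-alternating order (e.g. <A[1], B[0], A[3], B[2]>) as
// SK_PermuteTwoSrc.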
  300. /// \returns Opcode that can be clubbed with \p Op to create an alternate
  301. /// sequence which can later be merged as a ShuffleVector instruction.
  302. static unsigned getAltOpcode(unsigned Op) {
  303. switch (Op) {
  304. case Instruction::FAdd:
  305. return Instruction::FSub;
  306. case Instruction::FSub:
  307. return Instruction::FAdd;
  308. case Instruction::Add:
  309. return Instruction::Sub;
  310. case Instruction::Sub:
  311. return Instruction::Add;
  312. default:
  313. return 0;
  314. }
  315. }
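// Illustrative sketch (hypothetical IR): a lane-wise add/sub sequence such as
//   %r0 = fadd float %a0, %b0
//   %r1 = fsub float %a1, %b1
//   %r2 = fadd float %a2, %b2
//   %r3 = fsub float %a3, %b3
// pairs FAdd with its alternate FSub, so the four scalars can be emitted as
// one vector fadd, one vector fsub, and a shufflevector blending their lanes.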
  316. static bool isOdd(unsigned Value) {
  317. return Value & 1;
  318. }
  319. static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode,
  320. unsigned CheckedOpcode) {
  321. return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
  322. }
  323. /// Chooses the correct key for scheduling data. If \p Op has the same (or
  324. /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
  325. /// OpValue.
  326. static Value *isOneOf(Value *OpValue, Value *Op) {
  327. auto *I = dyn_cast<Instruction>(Op);
  328. if (!I)
  329. return OpValue;
  330. auto *OpInst = cast<Instruction>(OpValue);
  331. unsigned OpInstOpcode = OpInst->getOpcode();
  332. unsigned IOpcode = I->getOpcode();
  333. if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode))
  334. return Op;
  335. return OpValue;
  336. }
  337. namespace {
  339. /// Contains data for the instructions that are going to be vectorized.
  339. struct RawInstructionsData {
  340. /// Main Opcode of the instructions going to be vectorized.
  341. unsigned Opcode = 0;
  343. /// True if the list of instructions contains instructions with alternate opcodes.
  343. bool HasAltOpcodes = false;
  344. };
  345. } // end anonymous namespace
  346. /// Checks the list of the vectorized instructions \p VL and returns info about
  347. /// this list.
  348. static RawInstructionsData getMainOpcode(ArrayRef<Value *> VL) {
  349. auto *I0 = dyn_cast<Instruction>(VL[0]);
  350. if (!I0)
  351. return {};
  352. RawInstructionsData Res;
  353. unsigned Opcode = I0->getOpcode();
  354. // Walk through the list of the vectorized instructions
  355. // in order to check its structure described by RawInstructionsData.
  356. for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
  357. auto *I = dyn_cast<Instruction>(VL[Cnt]);
  358. if (!I)
  359. return {};
  360. if (Opcode != I->getOpcode())
  361. Res.HasAltOpcodes = true;
  362. }
  363. Res.Opcode = Opcode;
  364. return Res;
  365. }
  366. namespace {
  367. /// Main data required for vectorization of instructions.
  368. struct InstructionsState {
  369. /// The very first instruction in the list with the main opcode.
  370. Value *OpValue = nullptr;
  371. /// The main opcode for the list of instructions.
  372. unsigned Opcode = 0;
  373. /// Some of the instructions in the list have alternate opcodes.
  374. bool IsAltShuffle = false;
  375. InstructionsState() = default;
  376. InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
  377. : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
  378. };
  379. } // end anonymous namespace
  380. /// \returns analysis of the instructions in \p VL described in
  381. /// InstructionsState: the opcode with which we suppose the whole list
  382. /// could be vectorized, even if its structure is diverse.
  383. static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
  384. auto Res = getMainOpcode(VL);
  385. unsigned Opcode = Res.Opcode;
  386. if (!Res.HasAltOpcodes)
  387. return InstructionsState(VL[0], Opcode, false);
  388. auto *OpInst = cast<Instruction>(VL[0]);
  389. unsigned AltOpcode = getAltOpcode(Opcode);
  390. // Examine each element in the instruction list VL to determine
  391. // whether some operations there could be considered alternatives
  392. // (for example, subtraction as the alternative to addition).
  393. for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
  394. auto *I = cast<Instruction>(VL[Cnt]);
  395. unsigned InstOpcode = I->getOpcode();
  396. if ((Res.HasAltOpcodes &&
  397. InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
  398. (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
  399. return InstructionsState(OpInst, 0, false);
  400. }
  401. }
  402. return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
  403. }
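// For illustration (hypothetical lists): {add, sub, add, sub} yields
// Opcode == Instruction::Add with IsAltShuffle == true, whereas {add, mul}
// yields Opcode == 0 because Mul is not the alternate of Add.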
  404. /// \returns true if all of the values in \p VL have the same type or false
  405. /// otherwise.
  406. static bool allSameType(ArrayRef<Value *> VL) {
  407. Type *Ty = VL[0]->getType();
  408. for (int i = 1, e = VL.size(); i < e; i++)
  409. if (VL[i]->getType() != Ty)
  410. return false;
  411. return true;
  412. }
  413. /// \returns True if Extract{Value,Element} instruction extracts element Idx.
  414. static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
  415. assert(Opcode == Instruction::ExtractElement ||
  416. Opcode == Instruction::ExtractValue);
  417. if (Opcode == Instruction::ExtractElement) {
  418. ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
  419. return CI && CI->getZExtValue() == Idx;
  420. } else {
  421. ExtractValueInst *EI = cast<ExtractValueInst>(E);
  422. return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
  423. }
  424. }
  425. /// \returns True if in-tree use also needs extract. This refers to
  426. /// a possible scalar operand in a vectorized instruction.
  427. static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
  428. TargetLibraryInfo *TLI) {
  429. unsigned Opcode = UserInst->getOpcode();
  430. switch (Opcode) {
  431. case Instruction::Load: {
  432. LoadInst *LI = cast<LoadInst>(UserInst);
  433. return (LI->getPointerOperand() == Scalar);
  434. }
  435. case Instruction::Store: {
  436. StoreInst *SI = cast<StoreInst>(UserInst);
  437. return (SI->getPointerOperand() == Scalar);
  438. }
  439. case Instruction::Call: {
  440. CallInst *CI = cast<CallInst>(UserInst);
  441. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  442. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  443. return (CI->getArgOperand(1) == Scalar);
  444. }
  445. LLVM_FALLTHROUGH;
  446. }
  447. default:
  448. return false;
  449. }
  450. }
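// Example (for illustration): if \p UserInst is a store that is itself part of
// the tree and \p Scalar is its pointer operand, the vectorized store still
// consumes the address as a scalar, so \p Scalar must be extracted from its
// vectorized form rather than dropped with the rest of the scalar code.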
  451. /// \returns the AA location that is being accessed by the instruction.
  452. static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  453. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  454. return MemoryLocation::get(SI);
  455. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  456. return MemoryLocation::get(LI);
  457. return MemoryLocation();
  458. }
  459. /// \returns True if the instruction is not a volatile or atomic load/store.
  460. static bool isSimple(Instruction *I) {
  461. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  462. return LI->isSimple();
  463. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  464. return SI->isSimple();
  465. if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
  466. return !MI->isVolatile();
  467. return true;
  468. }
  469. namespace llvm {
  470. namespace slpvectorizer {
  471. /// Bottom Up SLP Vectorizer.
  472. class BoUpSLP {
  473. public:
  474. using ValueList = SmallVector<Value *, 8>;
  475. using InstrList = SmallVector<Instruction *, 16>;
  476. using ValueSet = SmallPtrSet<Value *, 16>;
  477. using StoreList = SmallVector<StoreInst *, 8>;
  478. using ExtraValueToDebugLocsMap =
  479. MapVector<Value *, SmallVector<Instruction *, 2>>;
  480. BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
  481. TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
  482. DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
  483. const DataLayout *DL, OptimizationRemarkEmitter *ORE)
  484. : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
  485. DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
  486. CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  487. // Use the vector register size specified by the target unless overridden
  488. // by a command-line option.
  489. // TODO: It would be better to limit the vectorization factor based on
  490. // data type rather than just register size. For example, x86 AVX has
  491. // 256-bit registers, but it does not support integer operations
  492. // at that width (that requires AVX2).
  493. if (MaxVectorRegSizeOption.getNumOccurrences())
  494. MaxVecRegSize = MaxVectorRegSizeOption;
  495. else
  496. MaxVecRegSize = TTI->getRegisterBitWidth(true);
  497. if (MinVectorRegSizeOption.getNumOccurrences())
  498. MinVecRegSize = MinVectorRegSizeOption;
  499. else
  500. MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  501. }
  502. /// \brief Vectorize the tree that starts with the elements in \p VL.
  503. /// Returns the vectorized root.
  504. Value *vectorizeTree();
  505. /// Vectorize the tree but with the list of externally used values \p
  507. /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  508. /// generated extractvalue instructions.
  508. Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
  509. /// \returns the cost incurred by unwanted spills and fills, caused by
  510. /// holding live values over call sites.
  511. int getSpillCost();
  512. /// \returns the vectorization cost of the subtree that starts at \p VL.
  513. /// A negative number means that this is profitable.
  514. int getTreeCost();
  515. /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  516. /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  517. void buildTree(ArrayRef<Value *> Roots,
  518. ArrayRef<Value *> UserIgnoreLst = None);
  519. /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  520. /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
  521. /// into account (and updating it, if required) the list of externally used
  522. /// values stored in \p ExternallyUsedValues.
  523. void buildTree(ArrayRef<Value *> Roots,
  524. ExtraValueToDebugLocsMap &ExternallyUsedValues,
  525. ArrayRef<Value *> UserIgnoreLst = None);
  526. /// Clear the internal data structures that are created by 'buildTree'.
  527. void deleteTree() {
  528. VectorizableTree.clear();
  529. ScalarToTreeEntry.clear();
  530. MustGather.clear();
  531. ExternalUses.clear();
  532. NumLoadsWantToKeepOrder = 0;
  533. NumLoadsWantToChangeOrder = 0;
  534. for (auto &Iter : BlocksSchedules) {
  535. BlockScheduling *BS = Iter.second.get();
  536. BS->clear();
  537. }
  538. MinBWs.clear();
  539. }
  540. unsigned getTreeSize() const { return VectorizableTree.size(); }
  541. /// \brief Perform LICM and CSE on the newly generated gather sequences.
  542. void optimizeGatherSequence(Function &F);
  543. /// \returns true if it is beneficial to reverse the vector order.
  544. bool shouldReorder() const {
  545. return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  546. }
  547. /// \return The vector element size in bits to use when vectorizing the
  548. /// expression tree ending at \p V. If V is a store, the size is the width of
  549. /// the stored value. Otherwise, the size is the width of the largest loaded
  550. /// value reaching V. This method is used by the vectorizer to calculate
  551. /// vectorization factors.
  552. unsigned getVectorElementSize(Value *V);
  553. /// Compute the minimum type sizes required to represent the entries in a
  554. /// vectorizable tree.
  555. void computeMinimumValueSizes();
  556. // \returns maximum vector register size as set by TTI or overridden by cl::opt.
  557. unsigned getMaxVecRegSize() const {
  558. return MaxVecRegSize;
  559. }
  560. // \returns minimum vector register size as set by cl::opt.
  561. unsigned getMinVecRegSize() const {
  562. return MinVecRegSize;
  563. }
  564. /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
  565. ///
  566. /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  567. unsigned canMapToVector(Type *T, const DataLayout &DL) const;
  568. /// \returns True if the VectorizableTree is both tiny and not fully
  569. /// vectorizable. We do not vectorize such trees.
  570. bool isTreeTinyAndNotFullyVectorizable();
  571. OptimizationRemarkEmitter *getORE() { return ORE; }
  572. private:
  573. struct TreeEntry;
  575. /// Checks if all users of \p I are part of the vectorization tree.
  575. bool areAllUsersVectorized(Instruction *I) const;
  576. /// \returns the cost of the vectorizable entry.
  577. int getEntryCost(TreeEntry *E);
  578. /// This is the recursive part of buildTree.
  579. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int UserIndx = -1,
  580. int OpdNum = 0);
  581. /// \returns True if the ExtractElement/ExtractValue instructions in VL can
  582. /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
  583. bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;
  584. /// Vectorize a single entry in the tree. \p OpdNum indicates the ordinality of
  585. /// the operand corresponding to this tree entry \p E for the user tree entry
  586. /// indicated by \p UserIndx.
  587. // In other words, "E == TreeEntry[UserIndx].getOperand(OpdNum)".
  588. Value *vectorizeTree(TreeEntry *E, int OpdNum = 0, int UserIndx = -1);
  589. /// Vectorize a single entry in the tree, starting in \p VL. \p OpdNum indicates
  590. /// the ordinality of the operand corresponding to the \p VL of scalar values for
  591. /// the user indicated by \p UserIndx that this \p VL feeds into.
  592. Value *vectorizeTree(ArrayRef<Value *> VL, int OpdNum = 0, int UserIndx = -1);
  593. /// \returns the pointer to the vectorized value if \p VL is already
  594. /// vectorized, or NULL. This may happen in cycles.
  595. Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;
  596. /// \returns the scalarization cost for this type. Scalarization in this
  597. /// context means the creation of vectors from a group of scalars.
  598. int getGatherCost(Type *Ty);
  599. /// \returns the scalarization cost for this list of values. Assuming that
  600. /// this subtree gets vectorized, we may need to extract the values from the
  601. /// roots. This method calculates the cost of extracting the values.
  602. int getGatherCost(ArrayRef<Value *> VL);
  603. /// \brief Set the Builder insert point to one after the last instruction in
  604. /// the bundle
  605. void setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue);
  606. /// \returns a vector from a collection of scalars in \p VL.
  607. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
  608. /// \returns whether the VectorizableTree is fully vectorizable and will
  609. /// be beneficial even if the tree height is tiny.
  610. bool isFullyVectorizableTinyTree();
  611. /// Reorders commutative operands in alt shuffle if they result in
  612. /// vectorized code.
  613. void reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
  614. SmallVectorImpl<Value *> &Left,
  615. SmallVectorImpl<Value *> &Right);
  616. /// Reorders commutative operands to get a better probability of
  617. /// generating vectorized code.
  618. void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef<Value *> VL,
  619. SmallVectorImpl<Value *> &Left,
  620. SmallVectorImpl<Value *> &Right);
  621. struct TreeEntry {
  622. TreeEntry(std::vector<TreeEntry> &Container) : Container(Container) {}
  623. /// \returns true if the scalars in VL are equal to this entry.
  624. bool isSame(ArrayRef<Value *> VL) const {
  625. assert(VL.size() == Scalars.size() && "Invalid size");
  626. return std::equal(VL.begin(), VL.end(), Scalars.begin());
  627. }
  628. /// \returns true if the scalars in VL are found in this tree entry.
  629. bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
  630. ScalarEvolution &SE) const {
  631. assert(VL.size() == Scalars.size() && "Invalid size");
  632. SmallVector<Value *, 8> List;
  633. if (!sortLoadAccesses(VL, DL, SE, List))
  634. return false;
  635. return std::equal(List.begin(), List.end(), Scalars.begin());
  636. }
  637. /// A vector of scalars.
  638. ValueList Scalars;
  639. /// The Scalars are vectorized into this value. It is initialized to Null.
  640. Value *VectorizedValue = nullptr;
  641. /// Do we need to gather this sequence?
  642. bool NeedToGather = false;
  643. /// Records optional shuffle mask for the uses of jumbled memory accesses.
  644. /// For example, a non-empty ShuffleMask[1] represents the permutation of
  645. /// lanes that operand #1 of this vectorized instruction should undergo
  646. /// before feeding this vectorized instruction, whereas an empty
  647. /// ShuffleMask[0] indicates that the lanes of operand #0 of this vectorized
  648. /// instruction need not be permuted at all.
  649. SmallVector<SmallVector<unsigned, 4>, 2> ShuffleMask;
  650. /// Points back to the VectorizableTree.
  651. ///
  652. /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
  653. /// to be a pointer and needs to be able to initialize the child iterator.
  654. /// Thus we need a reference back to the container to translate the indices
  655. /// to entries.
  656. std::vector<TreeEntry> &Container;
  657. /// The TreeEntry index containing the user of this entry. We can actually
  658. /// have multiple users so the data structure is not truly a tree.
  659. SmallVector<int, 1> UserTreeIndices;
  660. };
  661. /// Create a new VectorizableTree entry.
  662. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
  663. int &UserTreeIdx, const InstructionsState &S,
  664. ArrayRef<unsigned> ShuffleMask = None,
  665. int OpdNum = 0) {
  666. assert((!Vectorized || S.Opcode != 0) &&
  667. "Vectorized TreeEntry without opcode");
  668. VectorizableTree.emplace_back(VectorizableTree);
  669. int idx = VectorizableTree.size() - 1;
  670. TreeEntry *Last = &VectorizableTree[idx];
  671. Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
  672. Last->NeedToGather = !Vectorized;
  673. TreeEntry *UserTreeEntry = nullptr;
  674. if (UserTreeIdx != -1)
  675. UserTreeEntry = &VectorizableTree[UserTreeIdx];
  676. if (UserTreeEntry && !ShuffleMask.empty()) {
  677. if ((unsigned)OpdNum >= UserTreeEntry->ShuffleMask.size())
  678. UserTreeEntry->ShuffleMask.resize(OpdNum + 1);
  679. assert(UserTreeEntry->ShuffleMask[OpdNum].empty() &&
  680. "Mask already present");
  681. using mask = SmallVector<unsigned, 4>;
  682. mask tempMask(ShuffleMask.begin(), ShuffleMask.end());
  683. UserTreeEntry->ShuffleMask[OpdNum] = tempMask;
  684. }
  685. if (Vectorized) {
  686. for (int i = 0, e = VL.size(); i != e; ++i) {
  687. assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
  688. ScalarToTreeEntry[VL[i]] = idx;
  689. }
  690. } else {
  691. MustGather.insert(VL.begin(), VL.end());
  692. }
  693. if (UserTreeIdx >= 0)
  694. Last->UserTreeIndices.push_back(UserTreeIdx);
  695. UserTreeIdx = idx;
  696. return Last;
  697. }
  698. /// -- Vectorization State --
  699. /// Holds all of the tree entries.
  700. std::vector<TreeEntry> VectorizableTree;
  701. TreeEntry *getTreeEntry(Value *V) {
  702. auto I = ScalarToTreeEntry.find(V);
  703. if (I != ScalarToTreeEntry.end())
  704. return &VectorizableTree[I->second];
  705. return nullptr;
  706. }
  707. const TreeEntry *getTreeEntry(Value *V) const {
  708. auto I = ScalarToTreeEntry.find(V);
  709. if (I != ScalarToTreeEntry.end())
  710. return &VectorizableTree[I->second];
  711. return nullptr;
  712. }
  713. /// Maps a specific scalar to its tree entry.
  714. SmallDenseMap<Value*, int> ScalarToTreeEntry;
  715. /// A list of scalars that we found that we need to keep as scalars.
  716. ValueSet MustGather;
  717. /// This POD struct describes one external user in the vectorized tree.
  718. struct ExternalUser {
  719. ExternalUser(Value *S, llvm::User *U, int L)
  720. : Scalar(S), User(U), Lane(L) {}
  721. // Which scalar in our function.
  722. Value *Scalar;
  723. // Which user that uses the scalar.
  724. llvm::User *User;
  725. // Which lane does the scalar belong to.
  726. int Lane;
  727. };
  728. using UserList = SmallVector<ExternalUser, 16>;
  729. /// Checks if two instructions may access the same memory.
  730. ///
  731. /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  732. /// is invariant in the calling loop.
  733. bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
  734. Instruction *Inst2) {
  735. // First check if the result is already in the cache.
  736. AliasCacheKey key = std::make_pair(Inst1, Inst2);
  737. Optional<bool> &result = AliasCache[key];
  738. if (result.hasValue()) {
  739. return result.getValue();
  740. }
  741. MemoryLocation Loc2 = getLocation(Inst2, AA);
  742. bool aliased = true;
  743. if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
  744. // Do the alias check.
  745. aliased = AA->alias(Loc1, Loc2);
  746. }
  747. // Store the result in the cache.
  748. result = aliased;
  749. return aliased;
  750. }
  751. using AliasCacheKey = std::pair<Instruction *, Instruction *>;
  752. /// Cache for alias results.
  753. /// TODO: consider moving this to the AliasAnalysis itself.
  754. DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
  755. /// Removes an instruction from its block and eventually deletes it.
  756. /// It's like Instruction::eraseFromParent() except that the actual deletion
  757. /// is delayed until BoUpSLP is destructed.
  758. /// This is required to ensure that there are no incorrect collisions in the
  759. /// AliasCache, which can happen if a new instruction is allocated at the
  760. /// same address as a previously deleted instruction.
  761. void eraseInstruction(Instruction *I) {
  762. I->removeFromParent();
  763. I->dropAllReferences();
  764. DeletedInstructions.emplace_back(I);
  765. }
  766. /// Temporary store for deleted instructions. Instructions will be deleted
  767. /// eventually when the BoUpSLP is destructed.
  768. SmallVector<unique_value, 8> DeletedInstructions;
  769. /// A list of values that need to extracted out of the tree.
  770. /// This list holds pairs of (Internal Scalar : External User). External User
  771. /// can be nullptr, it means that this Internal Scalar will be used later,
  772. /// after vectorization.
  773. UserList ExternalUses;
  774. /// Values used only by @llvm.assume calls.
  775. SmallPtrSet<const Value *, 32> EphValues;
  776. /// Holds all of the instructions that we gathered.
  777. SetVector<Instruction *> GatherSeq;
  778. /// A list of blocks that we are going to CSE.
  779. SetVector<BasicBlock *> CSEBlocks;
  780. /// Contains all scheduling relevant data for an instruction.
  781. /// A ScheduleData either represents a single instruction or a member of an
  782. /// instruction bundle (= a group of instructions which is combined into a
  783. /// vector instruction).
  784. struct ScheduleData {
  785. // The initial value for the dependency counters. It means that the
  786. // dependencies are not calculated yet.
  787. enum { InvalidDeps = -1 };
  788. ScheduleData() = default;
  789. void init(int BlockSchedulingRegionID, Value *OpVal) {
  790. FirstInBundle = this;
  791. NextInBundle = nullptr;
  792. NextLoadStore = nullptr;
  793. IsScheduled = false;
  794. SchedulingRegionID = BlockSchedulingRegionID;
  795. UnscheduledDepsInBundle = UnscheduledDeps;
  796. clearDependencies();
  797. OpValue = OpVal;
  798. }
  799. /// Returns true if the dependency information has been calculated.
  800. bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
  801. /// Returns true for single instructions and for bundle representatives
  802. /// (= the head of a bundle).
  803. bool isSchedulingEntity() const { return FirstInBundle == this; }
  804. /// Returns true if it represents an instruction bundle and not only a
  805. /// single instruction.
  806. bool isPartOfBundle() const {
  807. return NextInBundle != nullptr || FirstInBundle != this;
  808. }
  809. /// Returns true if it is ready for scheduling, i.e. it has no more
810. /// unscheduled dependent instructions/bundles.
  811. bool isReady() const {
  812. assert(isSchedulingEntity() &&
  813. "can't consider non-scheduling entity for ready list");
  814. return UnscheduledDepsInBundle == 0 && !IsScheduled;
  815. }
  816. /// Modifies the number of unscheduled dependencies, also updating it for
  817. /// the whole bundle.
  818. int incrementUnscheduledDeps(int Incr) {
  819. UnscheduledDeps += Incr;
  820. return FirstInBundle->UnscheduledDepsInBundle += Incr;
  821. }
  822. /// Sets the number of unscheduled dependencies to the number of
  823. /// dependencies.
  824. void resetUnscheduledDeps() {
  825. incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
  826. }
  827. /// Clears all dependency information.
  828. void clearDependencies() {
  829. Dependencies = InvalidDeps;
  830. resetUnscheduledDeps();
  831. MemoryDependencies.clear();
  832. }
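/// Prints this entry; for a bundle head all bundle members are printed in
/// brackets.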
  833. void dump(raw_ostream &os) const {
  834. if (!isSchedulingEntity()) {
  835. os << "/ " << *Inst;
  836. } else if (NextInBundle) {
  837. os << '[' << *Inst;
  838. ScheduleData *SD = NextInBundle;
  839. while (SD) {
  840. os << ';' << *SD->Inst;
  841. SD = SD->NextInBundle;
  842. }
  843. os << ']';
  844. } else {
  845. os << *Inst;
  846. }
  847. }
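/// The instruction that this ScheduleData describes.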
  848. Instruction *Inst = nullptr;
  849. /// Points to the head in an instruction bundle (and always to this for
  850. /// single instructions).
  851. ScheduleData *FirstInBundle = nullptr;
  852. /// Single linked list of all instructions in a bundle. Null if it is a
  853. /// single instruction.
  854. ScheduleData *NextInBundle = nullptr;
  855. /// Single linked list of all memory instructions (e.g. load, store, call)
  856. /// in the block - until the end of the scheduling region.
  857. ScheduleData *NextLoadStore = nullptr;
  858. /// The dependent memory instructions.
  859. /// This list is derived on demand in calculateDependencies().
  860. SmallVector<ScheduleData *, 4> MemoryDependencies;
  861. /// This ScheduleData is in the current scheduling region if this matches
  862. /// the current SchedulingRegionID of BlockScheduling.
  863. int SchedulingRegionID = 0;
  864. /// Used for getting a "good" final ordering of instructions.
  865. int SchedulingPriority = 0;
866. /// The number of dependencies. This is the number of users of the
  867. /// instruction plus the number of dependent memory instructions (if any).
  868. /// This value is calculated on demand.
  869. /// If InvalidDeps, the number of dependencies is not calculated yet.
  870. int Dependencies = InvalidDeps;
  871. /// The number of dependencies minus the number of dependencies of scheduled
  872. /// instructions. As soon as this is zero, the instruction/bundle gets ready
  873. /// for scheduling.
  874. /// Note that this is negative as long as Dependencies is not calculated.
  875. int UnscheduledDeps = InvalidDeps;
  876. /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
  877. /// single instructions.
  878. int UnscheduledDepsInBundle = InvalidDeps;
  879. /// True if this instruction is scheduled (or considered as scheduled in the
  880. /// dry-run).
  881. bool IsScheduled = false;
882. /// The value this ScheduleData is keyed on; for ordinary entries it is Inst.
  883. Value *OpValue = nullptr;
  884. };
  885. #ifndef NDEBUG
  886. friend inline raw_ostream &operator<<(raw_ostream &os,
  887. const BoUpSLP::ScheduleData &SD) {
  888. SD.dump(os);
  889. return os;
  890. }
  891. #endif
  892. friend struct GraphTraits<BoUpSLP *>;
  893. friend struct DOTGraphTraits<BoUpSLP *>;
  894. /// Contains all scheduling data for a basic block.
  895. struct BlockScheduling {
  896. BlockScheduling(BasicBlock *BB)
  897. : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
  898. void clear() {
  899. ReadyInsts.clear();
  900. ScheduleStart = nullptr;
  901. ScheduleEnd = nullptr;
  902. FirstLoadStoreInRegion = nullptr;
  903. LastLoadStoreInRegion = nullptr;
  904. // Reduce the maximum schedule region size by the size of the
  905. // previous scheduling run.
  906. ScheduleRegionSizeLimit -= ScheduleRegionSize;
  907. if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
  908. ScheduleRegionSizeLimit = MinScheduleRegionSize;
  909. ScheduleRegionSize = 0;
  910. // Make a new scheduling region, i.e. all existing ScheduleData is not
  911. // in the new region yet.
  912. ++SchedulingRegionID;
  913. }
  914. ScheduleData *getScheduleData(Value *V) {
  915. ScheduleData *SD = ScheduleDataMap[V];
  916. if (SD && SD->SchedulingRegionID == SchedulingRegionID)
  917. return SD;
  918. return nullptr;
  919. }
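/// Returns the ScheduleData for \p V that is keyed on \p Key (falling back
/// to the plain entry when \p V == \p Key), or nullptr if it is outside the
/// current scheduling region.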
  920. ScheduleData *getScheduleData(Value *V, Value *Key) {
  921. if (V == Key)
  922. return getScheduleData(V);
  923. auto I = ExtraScheduleDataMap.find(V);
  924. if (I != ExtraScheduleDataMap.end()) {
  925. ScheduleData *SD = I->second[Key];
  926. if (SD && SD->SchedulingRegionID == SchedulingRegionID)
  927. return SD;
  928. }
  929. return nullptr;
  930. }
  931. bool isInSchedulingRegion(ScheduleData *SD) {
  932. return SD->SchedulingRegionID == SchedulingRegionID;
  933. }
  934. /// Marks an instruction as scheduled and puts all dependent ready
  935. /// instructions into the ready-list.
  936. template <typename ReadyListType>
  937. void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
  938. SD->IsScheduled = true;
  939. DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
  940. ScheduleData *BundleMember = SD;
  941. while (BundleMember) {
  942. if (BundleMember->Inst != BundleMember->OpValue) {
  943. BundleMember = BundleMember->NextInBundle;
  944. continue;
  945. }
  946. // Handle the def-use chain dependencies.
  947. for (Use &U : BundleMember->Inst->operands()) {
  948. auto *I = dyn_cast<Instruction>(U.get());
  949. if (!I)
  950. continue;
  951. doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
  952. if (OpDef && OpDef->hasValidDependencies() &&
  953. OpDef->incrementUnscheduledDeps(-1) == 0) {
  954. // There are no more unscheduled dependencies after
  955. // decrementing, so we can put the dependent instruction
  956. // into the ready list.
  957. ScheduleData *DepBundle = OpDef->FirstInBundle;
  958. assert(!DepBundle->IsScheduled &&
  959. "already scheduled bundle gets ready");
  960. ReadyList.insert(DepBundle);
  961. DEBUG(dbgs()
  962. << "SLP: gets ready (def): " << *DepBundle << "\n");
  963. }
  964. });
  965. }
  966. // Handle the memory dependencies.
  967. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
  968. if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
  969. // There are no more unscheduled dependencies after decrementing,
  970. // so we can put the dependent instruction into the ready list.
  971. ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
  972. assert(!DepBundle->IsScheduled &&
  973. "already scheduled bundle gets ready");
  974. ReadyList.insert(DepBundle);
  975. DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle
  976. << "\n");
  977. }
  978. }
  979. BundleMember = BundleMember->NextInBundle;
  980. }
  981. }
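/// Invokes \p Action on every ScheduleData associated with \p V in the
/// current scheduling region, including entries registered under extra
/// opcode keys.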
  982. void doForAllOpcodes(Value *V,
  983. function_ref<void(ScheduleData *SD)> Action) {
  984. if (ScheduleData *SD = getScheduleData(V))
  985. Action(SD);
  986. auto I = ExtraScheduleDataMap.find(V);
  987. if (I != ExtraScheduleDataMap.end())
  988. for (auto &P : I->second)
  989. if (P.second->SchedulingRegionID == SchedulingRegionID)
  990. Action(P.second);
  991. }
  992. /// Put all instructions into the ReadyList which are ready for scheduling.
  993. template <typename ReadyListType>
  994. void initialFillReadyList(ReadyListType &ReadyList) {
  995. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  996. doForAllOpcodes(I, [&](ScheduleData *SD) {
  997. if (SD->isSchedulingEntity() && SD->isReady()) {
  998. ReadyList.insert(SD);
  999. DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
  1000. }
  1001. });
  1002. }
  1003. }
  1004. /// Checks if a bundle of instructions can be scheduled, i.e. has no
  1005. /// cyclic dependencies. This is only a dry-run, no instructions are
  1006. /// actually moved at this stage.
  1007. bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
  1008. /// Un-bundles a group of instructions.
  1009. void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
  1010. /// Allocates schedule data chunk.
  1011. ScheduleData *allocateScheduleDataChunks();
  1012. /// Extends the scheduling region so that V is inside the region.
  1013. /// \returns true if the region size is within the limit.
  1014. bool extendSchedulingRegion(Value *V, Value *OpValue);
  1015. /// Initialize the ScheduleData structures for new instructions in the
  1016. /// scheduling region.
  1017. void initScheduleData(Instruction *FromI, Instruction *ToI,
  1018. ScheduleData *PrevLoadStore,
  1019. ScheduleData *NextLoadStore);
  1020. /// Updates the dependency information of a bundle and of all instructions/
  1021. /// bundles which depend on the original bundle.
  1022. void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
  1023. BoUpSLP *SLP);
1024. /// Sets all instructions in the scheduling region to un-scheduled.
  1025. void resetSchedule();
  1026. BasicBlock *BB;
  1027. /// Simple memory allocation for ScheduleData.
  1028. std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
  1029. /// The size of a ScheduleData array in ScheduleDataChunks.
  1030. int ChunkSize;
  1031. /// The allocator position in the current chunk, which is the last entry
  1032. /// of ScheduleDataChunks.
  1033. int ChunkPos;
  1034. /// Attaches ScheduleData to Instruction.
  1035. /// Note that the mapping survives during all vectorization iterations, i.e.
  1036. /// ScheduleData structures are recycled.
  1037. DenseMap<Value *, ScheduleData *> ScheduleDataMap;
  1038. /// Attaches ScheduleData to Instruction with the leading key.
  1039. DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
  1040. ExtraScheduleDataMap;
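/// A simple ready-list: a SmallVector whose insert() just appends.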
  1041. struct ReadyList : SmallVector<ScheduleData *, 8> {
  1042. void insert(ScheduleData *SD) { push_back(SD); }
  1043. };
  1044. /// The ready-list for scheduling (only used for the dry-run).
  1045. ReadyList ReadyInsts;
  1046. /// The first instruction of the scheduling region.
  1047. Instruction *ScheduleStart = nullptr;
  1048. /// The first instruction _after_ the scheduling region.
  1049. Instruction *ScheduleEnd = nullptr;
  1050. /// The first memory accessing instruction in the scheduling region
  1051. /// (can be null).
  1052. ScheduleData *FirstLoadStoreInRegion = nullptr;
  1053. /// The last memory accessing instruction in the scheduling region
  1054. /// (can be null).
  1055. ScheduleData *LastLoadStoreInRegion = nullptr;
  1056. /// The current size of the scheduling region.
  1057. int ScheduleRegionSize = 0;
  1058. /// The maximum size allowed for the scheduling region.
  1059. int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
  1060. /// The ID of the scheduling region. For a new vectorization iteration this
  1061. /// is incremented which "removes" all ScheduleData from the region.
  1062. // Make sure that the initial SchedulingRegionID is greater than the
  1063. // initial SchedulingRegionID in ScheduleData (which is 0).
  1064. int SchedulingRegionID = 1;
  1065. };
  1066. /// Attaches the BlockScheduling structures to basic blocks.
  1067. MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
  1068. /// Performs the "real" scheduling. Done before vectorization is actually
  1069. /// performed in a basic block.
  1070. void scheduleBlock(BlockScheduling *BS);
  1071. /// List of users to ignore during scheduling and that don't need extracting.
  1072. ArrayRef<Value *> UserIgnoreList;
  1073. // Number of load bundles that contain consecutive loads.
  1074. int NumLoadsWantToKeepOrder = 0;
  1075. // Number of load bundles that contain consecutive loads in reversed order.
  1076. int NumLoadsWantToChangeOrder = 0;
  1077. // Analysis and block reference.
  1078. Function *F;
  1079. ScalarEvolution *SE;
  1080. TargetTransformInfo *TTI;
  1081. TargetLibraryInfo *TLI;
  1082. AliasAnalysis *AA;
  1083. LoopInfo *LI;
  1084. DominatorTree *DT;
  1085. AssumptionCache *AC;
  1086. DemandedBits *DB;
  1087. const DataLayout *DL;
  1088. OptimizationRemarkEmitter *ORE;
  1089. unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  1090. unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  1091. /// Instruction builder to construct the vectorized tree.
  1092. IRBuilder<> Builder;
  1093. /// A map of scalar integer values to the smallest bit width with which they
  1094. /// can legally be represented. The values map to (width, signed) pairs,
  1095. /// where "width" indicates the minimum bit width and "signed" is True if the
1096. /// value must be sign-extended, rather than zero-extended, back to its
  1097. /// original width.
  1098. MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
  1099. };
  1100. } // end namespace slpvectorizer
  1101. template <> struct GraphTraits<BoUpSLP *> {
  1102. using TreeEntry = BoUpSLP::TreeEntry;
  1103. /// NodeRef has to be a pointer per the GraphWriter.
  1104. using NodeRef = TreeEntry *;
  1105. /// \brief Add the VectorizableTree to the index iterator to be able to return
  1106. /// TreeEntry pointers.
  1107. struct ChildIteratorType
  1108. : public iterator_adaptor_base<ChildIteratorType,
  1109. SmallVector<int, 1>::iterator> {
  1110. std::vector<TreeEntry> &VectorizableTree;
  1111. ChildIteratorType(SmallVector<int, 1>::iterator W,
  1112. std::vector<TreeEntry> &VT)
  1113. : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
  1114. NodeRef operator*() { return &VectorizableTree[*I]; }
  1115. };
  1116. static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
  1117. static ChildIteratorType child_begin(NodeRef N) {
  1118. return {N->UserTreeIndices.begin(), N->Container};
  1119. }
  1120. static ChildIteratorType child_end(NodeRef N) {
  1121. return {N->UserTreeIndices.end(), N->Container};
  1122. }
  1123. /// For the node iterator we just need to turn the TreeEntry iterator into a
  1124. /// TreeEntry* iterator so that it dereferences to NodeRef.
  1125. using nodes_iterator = pointer_iterator<std::vector<TreeEntry>::iterator>;
  1126. static nodes_iterator nodes_begin(BoUpSLP *R) {
  1127. return nodes_iterator(R->VectorizableTree.begin());
  1128. }
  1129. static nodes_iterator nodes_end(BoUpSLP *R) {
  1130. return nodes_iterator(R->VectorizableTree.end());
  1131. }
  1132. static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
  1133. };
  1134. template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  1135. using TreeEntry = BoUpSLP::TreeEntry;
  1136. DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
  1137. std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
  1138. std::string Str;
  1139. raw_string_ostream OS(Str);
  1140. if (isSplat(Entry->Scalars)) {
  1141. OS << "<splat> " << *Entry->Scalars[0];
  1142. return Str;
  1143. }
  1144. for (auto V : Entry->Scalars) {
  1145. OS << *V;
  1146. if (std::any_of(
  1147. R->ExternalUses.begin(), R->ExternalUses.end(),
  1148. [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
  1149. OS << " <extract>";
  1150. OS << "\n";
  1151. }
  1152. return Str;
  1153. }
  1154. static std::string getNodeAttributes(const TreeEntry *Entry,
  1155. const BoUpSLP *) {
  1156. if (Entry->NeedToGather)
  1157. return "color=red";
  1158. return "";
  1159. }
  1160. };
  1161. } // end namespace llvm
  1162. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  1163. ArrayRef<Value *> UserIgnoreLst) {
  1164. ExtraValueToDebugLocsMap ExternallyUsedValues;
  1165. buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
  1166. }
  1167. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  1168. ExtraValueToDebugLocsMap &ExternallyUsedValues,
  1169. ArrayRef<Value *> UserIgnoreLst) {
  1170. deleteTree();
  1171. UserIgnoreList = UserIgnoreLst;
  1172. if (!allSameType(Roots))
  1173. return;
  1174. buildTree_rec(Roots, 0, -1);
  1175. // Collect the values that we need to extract from the tree.
  1176. for (TreeEntry &EIdx : VectorizableTree) {
  1177. TreeEntry *Entry = &EIdx;
  1178. // No need to handle users of gathered values.
  1179. if (Entry->NeedToGather)
  1180. continue;
  1181. // For each lane:
  1182. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  1183. Value *Scalar = Entry->Scalars[Lane];
  1184. // Check if the scalar is externally used as an extra arg.
  1185. auto ExtI = ExternallyUsedValues.find(Scalar);
  1186. if (ExtI != ExternallyUsedValues.end()) {
  1187. DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
  1188. Lane << " from " << *Scalar << ".\n");
  1189. ExternalUses.emplace_back(Scalar, nullptr, Lane);
  1190. continue;
  1191. }
  1192. for (User *U : Scalar->users()) {
  1193. DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
  1194. Instruction *UserInst = dyn_cast<Instruction>(U);
  1195. if (!UserInst)
  1196. continue;
  1197. // Skip in-tree scalars that become vectors
  1198. if (TreeEntry *UseEntry = getTreeEntry(U)) {
  1199. Value *UseScalar = UseEntry->Scalars[0];
  1200. // Some in-tree scalars will remain as scalar in vectorized
  1201. // instructions. If that is the case, the one in Lane 0 will
  1202. // be used.
  1203. if (UseScalar != U ||
  1204. !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
  1205. DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
  1206. << ".\n");
  1207. assert(!UseEntry->NeedToGather && "Bad state");
  1208. continue;
  1209. }
  1210. }
  1211. // Ignore users in the user ignore list.
  1212. if (is_contained(UserIgnoreList, UserInst))
  1213. continue;
  1214. DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
  1215. Lane << " from " << *Scalar << ".\n");
  1216. ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
  1217. }
  1218. }
  1219. }
  1220. }
  1221. void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
  1222. int UserTreeIdx, int OpdNum) {
  1223. assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
  1224. InstructionsState S = getSameOpcode(VL);
  1225. if (Depth == RecursionMaxDepth) {
  1226. DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
  1227. newTreeEntry(VL, false, UserTreeIdx, S);
  1228. return;
  1229. }
  1230. // Don't handle vectors.
  1231. if (S.OpValue->getType()->isVectorTy()) {
  1232. DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
  1233. newTreeEntry(VL, false, UserTreeIdx, S);
  1234. return;
  1235. }
  1236. if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
  1237. if (SI->getValueOperand()->getType()->isVectorTy()) {
  1238. DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
  1239. newTreeEntry(VL, false, UserTreeIdx, S);
  1240. return;
  1241. }
  1242. // If all of the operands are identical or constant we have a simple solution.
  1243. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
  1244. DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
  1245. newTreeEntry(VL, false, UserTreeIdx, S);
  1246. return;
  1247. }
  1248. // We now know that this is a vector of instructions of the same type from
  1249. // the same block.
  1250. // Don't vectorize ephemeral values.
  1251. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1252. if (EphValues.count(VL[i])) {
  1253. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  1254. ") is ephemeral.\n");
  1255. newTreeEntry(VL, false, UserTreeIdx, S);
  1256. return;
  1257. }
  1258. }
  1259. // Check if this is a duplicate of another entry.
  1260. if (TreeEntry *E = getTreeEntry(S.OpValue)) {
  1261. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1262. DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
  1263. if (E->Scalars[i] != VL[i]) {
  1264. DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
  1265. newTreeEntry(VL, false, UserTreeIdx, S);
  1266. return;
  1267. }
  1268. }
1269. // Record the reuse of the tree node. FIXME: currently this is only used to
  1270. // properly draw the graph rather than for the actual vectorization.
  1271. E->UserTreeIndices.push_back(UserTreeIdx);
  1272. DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n");
  1273. return;
  1274. }
  1275. // Check that none of the instructions in the bundle are already in the tree.
  1276. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1277. auto *I = dyn_cast<Instruction>(VL[i]);
  1278. if (!I)
  1279. continue;
  1280. if (getTreeEntry(I)) {
  1281. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  1282. ") is already in tree.\n");
  1283. newTreeEntry(VL, false, UserTreeIdx, S);
  1284. return;
  1285. }
  1286. }
  1287. // If any of the scalars is marked as a value that needs to stay scalar, then
  1288. // we need to gather the scalars.
  1289. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1290. if (MustGather.count(VL[i])) {
  1291. DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
  1292. newTreeEntry(VL, false, UserTreeIdx, S);
  1293. return;
  1294. }
  1295. }
  1296. // Check that all of the users of the scalars that we want to vectorize are
  1297. // schedulable.
  1298. auto *VL0 = cast<Instruction>(S.OpValue);
  1299. BasicBlock *BB = VL0->getParent();
  1300. if (!DT->isReachableFromEntry(BB)) {
  1301. // Don't go into unreachable blocks. They may contain instructions with
  1302. // dependency cycles which confuse the final scheduling.
  1303. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
  1304. newTreeEntry(VL, false, UserTreeIdx, S);
  1305. return;
  1306. }
  1307. // Check that every instruction appears once in this bundle.
  1308. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  1309. for (unsigned j = i + 1; j < e; ++j)
  1310. if (VL[i] == VL[j]) {
  1311. DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
  1312. newTreeEntry(VL, false, UserTreeIdx, S);
  1313. return;
  1314. }
  1315. auto &BSRef = BlocksSchedules[BB];
  1316. if (!BSRef)
  1317. BSRef = llvm::make_unique<BlockScheduling>(BB);
  1318. BlockScheduling &BS = *BSRef.get();
  1319. if (!BS.tryScheduleBundle(VL, this, S.OpValue)) {
  1320. DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
  1321. assert((!BS.getScheduleData(VL0) ||
  1322. !BS.getScheduleData(VL0)->isPartOfBundle()) &&
  1323. "tryScheduleBundle should cancelScheduling on failure");
  1324. newTreeEntry(VL, false, UserTreeIdx, S);
  1325. return;
  1326. }
  1327. DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  1328. unsigned ShuffleOrOp = S.IsAltShuffle ?
  1329. (unsigned) Instruction::ShuffleVector : S.Opcode;
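// Note: an alternating opcode bundle (e.g. add/sub) is keyed as a
// ShuffleVector, since it is modeled as two vector operations combined by a
// shuffle (see the ShuffleVector handling below and in getEntryCost).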
  1330. switch (ShuffleOrOp) {
  1331. case Instruction::PHI: {
  1332. PHINode *PH = dyn_cast<PHINode>(VL0);
  1333. // Check for terminator values (e.g. invoke).
  1334. for (unsigned j = 0; j < VL.size(); ++j)
  1335. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1336. TerminatorInst *Term = dyn_cast<TerminatorInst>(
  1337. cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
  1338. if (Term) {
  1339. DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
  1340. BS.cancelScheduling(VL, VL0);
  1341. newTreeEntry(VL, false, UserTreeIdx, S);
  1342. return;
  1343. }
  1344. }
  1345. newTreeEntry(VL, true, UserTreeIdx, S);
  1346. DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
  1347. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1348. ValueList Operands;
  1349. // Prepare the operand vector.
  1350. for (Value *j : VL)
  1351. Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
  1352. PH->getIncomingBlock(i)));
  1353. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1354. }
  1355. return;
  1356. }
  1357. case Instruction::ExtractValue:
  1358. case Instruction::ExtractElement: {
  1359. bool Reuse = canReuseExtract(VL, VL0);
  1360. if (Reuse) {
  1361. DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
  1362. } else {
  1363. BS.cancelScheduling(VL, VL0);
  1364. }
  1365. newTreeEntry(VL, Reuse, UserTreeIdx, S);
  1366. return;
  1367. }
  1368. case Instruction::Load: {
  1369. // Check that a vectorized load would load the same memory as a scalar
  1370. // load. For example, we don't want to vectorize loads that are smaller
  1371. // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
  1372. // treats loading/storing it as an i8 struct. If we vectorize loads/stores
  1373. // from such a struct, we read/write packed bits disagreeing with the
  1374. // unvectorized version.
  1375. Type *ScalarTy = VL0->getType();
  1376. if (DL->getTypeSizeInBits(ScalarTy) !=
  1377. DL->getTypeAllocSizeInBits(ScalarTy)) {
  1378. BS.cancelScheduling(VL, VL0);
  1379. newTreeEntry(VL, false, UserTreeIdx, S);
  1380. DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
  1381. return;
  1382. }
  1383. // Make sure all loads in the bundle are simple - we can't vectorize
  1384. // atomic or volatile loads.
1385. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1386. LoadInst *L = cast<LoadInst>(VL[i]);
  1387. if (!L->isSimple()) {
  1388. BS.cancelScheduling(VL, VL0);
  1389. newTreeEntry(VL, false, UserTreeIdx, S);
  1390. DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
  1391. return;
  1392. }
  1393. }
  1394. // Check if the loads are consecutive, reversed, or neither.
  1395. bool Consecutive = true;
  1396. bool ReverseConsecutive = true;
  1397. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  1398. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1399. Consecutive = false;
  1400. break;
  1401. } else {
  1402. ReverseConsecutive = false;
  1403. }
  1404. }
  1405. if (Consecutive) {
  1406. ++NumLoadsWantToKeepOrder;
  1407. newTreeEntry(VL, true, UserTreeIdx, S);
  1408. DEBUG(dbgs() << "SLP: added a vector of loads.\n");
  1409. return;
  1410. }
  1411. // If none of the load pairs were consecutive when checked in order,
  1412. // check the reverse order.
  1413. if (ReverseConsecutive)
  1414. for (unsigned i = VL.size() - 1; i > 0; --i)
  1415. if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
  1416. ReverseConsecutive = false;
  1417. break;
  1418. }
  1419. if (ReverseConsecutive) {
  1420. DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
  1421. ++NumLoadsWantToChangeOrder;
  1422. BS.cancelScheduling(VL, VL0);
  1423. newTreeEntry(VL, false, UserTreeIdx, S);
  1424. return;
  1425. }
  1426. if (VL.size() > 2) {
  1427. bool ShuffledLoads = true;
  1428. SmallVector<Value *, 8> Sorted;
  1429. SmallVector<unsigned, 4> Mask;
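// Try to sort the loads by address. On success, Sorted holds the loads in
// sorted order and Mask records the corresponding lane permutation, which
// is attached to the newly created tree entry below.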
  1430. if (sortLoadAccesses(VL, *DL, *SE, Sorted, &Mask)) {
  1431. auto NewVL = makeArrayRef(Sorted.begin(), Sorted.end());
  1432. for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
  1433. if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
  1434. ShuffledLoads = false;
  1435. break;
  1436. }
  1437. }
1438. // TODO: Tracking how many loads want an arbitrary shuffled order
1439. // would be useful.
  1440. if (ShuffledLoads) {
  1441. DEBUG(dbgs() << "SLP: added a vector of loads which needs "
  1442. "permutation of loaded lanes.\n");
  1443. newTreeEntry(NewVL, true, UserTreeIdx, S,
  1444. makeArrayRef(Mask.begin(), Mask.end()), OpdNum);
  1445. return;
  1446. }
  1447. }
  1448. }
  1449. DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
  1450. BS.cancelScheduling(VL, VL0);
  1451. newTreeEntry(VL, false, UserTreeIdx, S);
  1452. return;
  1453. }
  1454. case Instruction::ZExt:
  1455. case Instruction::SExt:
  1456. case Instruction::FPToUI:
  1457. case Instruction::FPToSI:
  1458. case Instruction::FPExt:
  1459. case Instruction::PtrToInt:
  1460. case Instruction::IntToPtr:
  1461. case Instruction::SIToFP:
  1462. case Instruction::UIToFP:
  1463. case Instruction::Trunc:
  1464. case Instruction::FPTrunc:
  1465. case Instruction::BitCast: {
  1466. Type *SrcTy = VL0->getOperand(0)->getType();
  1467. for (unsigned i = 0; i < VL.size(); ++i) {
  1468. Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
  1469. if (Ty != SrcTy || !isValidElementType(Ty)) {
  1470. BS.cancelScheduling(VL, VL0);
  1471. newTreeEntry(VL, false, UserTreeIdx, S);
  1472. DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
  1473. return;
  1474. }
  1475. }
  1476. newTreeEntry(VL, true, UserTreeIdx, S);
  1477. DEBUG(dbgs() << "SLP: added a vector of casts.\n");
  1478. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1479. ValueList Operands;
  1480. // Prepare the operand vector.
  1481. for (Value *j : VL)
  1482. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1483. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1484. }
  1485. return;
  1486. }
  1487. case Instruction::ICmp:
  1488. case Instruction::FCmp: {
  1489. // Check that all of the compares have the same predicate.
  1490. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1491. Type *ComparedTy = VL0->getOperand(0)->getType();
  1492. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  1493. CmpInst *Cmp = cast<CmpInst>(VL[i]);
  1494. if (Cmp->getPredicate() != P0 ||
  1495. Cmp->getOperand(0)->getType() != ComparedTy) {
  1496. BS.cancelScheduling(VL, VL0);
  1497. newTreeEntry(VL, false, UserTreeIdx, S);
  1498. DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
  1499. return;
  1500. }
  1501. }
  1502. newTreeEntry(VL, true, UserTreeIdx, S);
  1503. DEBUG(dbgs() << "SLP: added a vector of compares.\n");
  1504. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1505. ValueList Operands;
  1506. // Prepare the operand vector.
  1507. for (Value *j : VL)
  1508. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1509. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1510. }
  1511. return;
  1512. }
  1513. case Instruction::Select:
  1514. case Instruction::Add:
  1515. case Instruction::FAdd:
  1516. case Instruction::Sub:
  1517. case Instruction::FSub:
  1518. case Instruction::Mul:
  1519. case Instruction::FMul:
  1520. case Instruction::UDiv:
  1521. case Instruction::SDiv:
  1522. case Instruction::FDiv:
  1523. case Instruction::URem:
  1524. case Instruction::SRem:
  1525. case Instruction::FRem:
  1526. case Instruction::Shl:
  1527. case Instruction::LShr:
  1528. case Instruction::AShr:
  1529. case Instruction::And:
  1530. case Instruction::Or:
  1531. case Instruction::Xor:
  1532. newTreeEntry(VL, true, UserTreeIdx, S);
  1533. DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
  1534. // Sort operands of the instructions so that each side is more likely to
  1535. // have the same opcode.
  1536. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
  1537. ValueList Left, Right;
  1538. reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right);
  1539. buildTree_rec(Left, Depth + 1, UserTreeIdx);
  1540. buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
  1541. return;
  1542. }
  1543. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1544. ValueList Operands;
  1545. // Prepare the operand vector.
  1546. for (Value *j : VL)
  1547. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1548. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1549. }
  1550. return;
  1551. case Instruction::GetElementPtr: {
  1552. // We don't combine GEPs with complicated (nested) indexing.
  1553. for (unsigned j = 0; j < VL.size(); ++j) {
  1554. if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
  1555. DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
  1556. BS.cancelScheduling(VL, VL0);
  1557. newTreeEntry(VL, false, UserTreeIdx, S);
  1558. return;
  1559. }
  1560. }
  1561. // We can't combine several GEPs into one vector if they operate on
  1562. // different types.
  1563. Type *Ty0 = VL0->getOperand(0)->getType();
  1564. for (unsigned j = 0; j < VL.size(); ++j) {
  1565. Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
  1566. if (Ty0 != CurTy) {
  1567. DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
  1568. BS.cancelScheduling(VL, VL0);
  1569. newTreeEntry(VL, false, UserTreeIdx, S);
  1570. return;
  1571. }
  1572. }
  1573. // We don't combine GEPs with non-constant indexes.
  1574. for (unsigned j = 0; j < VL.size(); ++j) {
  1575. auto Op = cast<Instruction>(VL[j])->getOperand(1);
  1576. if (!isa<ConstantInt>(Op)) {
  1577. DEBUG(
  1578. dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
  1579. BS.cancelScheduling(VL, VL0);
  1580. newTreeEntry(VL, false, UserTreeIdx, S);
  1581. return;
  1582. }
  1583. }
  1584. newTreeEntry(VL, true, UserTreeIdx, S);
  1585. DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
  1586. for (unsigned i = 0, e = 2; i < e; ++i) {
  1587. ValueList Operands;
  1588. // Prepare the operand vector.
  1589. for (Value *j : VL)
  1590. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1591. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1592. }
  1593. return;
  1594. }
  1595. case Instruction::Store: {
1596. // Check if the stores are consecutive or if we need to swizzle them.
  1597. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
  1598. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1599. BS.cancelScheduling(VL, VL0);
  1600. newTreeEntry(VL, false, UserTreeIdx, S);
  1601. DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
  1602. return;
  1603. }
  1604. newTreeEntry(VL, true, UserTreeIdx, S);
  1605. DEBUG(dbgs() << "SLP: added a vector of stores.\n");
  1606. ValueList Operands;
  1607. for (Value *j : VL)
  1608. Operands.push_back(cast<Instruction>(j)->getOperand(0));
  1609. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1610. return;
  1611. }
  1612. case Instruction::Call: {
  1613. // Check if the calls are all to the same vectorizable intrinsic.
  1614. CallInst *CI = cast<CallInst>(VL0);
  1615. // Check if this is an Intrinsic call or something that can be
1616. // represented by an intrinsic call.
  1617. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1618. if (!isTriviallyVectorizable(ID)) {
  1619. BS.cancelScheduling(VL, VL0);
  1620. newTreeEntry(VL, false, UserTreeIdx, S);
  1621. DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
  1622. return;
  1623. }
  1624. Function *Int = CI->getCalledFunction();
  1625. Value *A1I = nullptr;
  1626. if (hasVectorInstrinsicScalarOpd(ID, 1))
  1627. A1I = CI->getArgOperand(1);
  1628. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1629. CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
  1630. if (!CI2 || CI2->getCalledFunction() != Int ||
  1631. getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
  1632. !CI->hasIdenticalOperandBundleSchema(*CI2)) {
  1633. BS.cancelScheduling(VL, VL0);
  1634. newTreeEntry(VL, false, UserTreeIdx, S);
  1635. DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
  1636. << "\n");
  1637. return;
  1638. }
1639. // ctlz, cttz and powi are special intrinsics whose second argument
1640. // must be the same for them to be vectorized.
  1641. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  1642. Value *A1J = CI2->getArgOperand(1);
  1643. if (A1I != A1J) {
  1644. BS.cancelScheduling(VL, VL0);
  1645. newTreeEntry(VL, false, UserTreeIdx, S);
  1646. DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
  1647. << " argument "<< A1I<<"!=" << A1J
  1648. << "\n");
  1649. return;
  1650. }
  1651. }
  1652. // Verify that the bundle operands are identical between the two calls.
  1653. if (CI->hasOperandBundles() &&
  1654. !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
  1655. CI->op_begin() + CI->getBundleOperandsEndIndex(),
  1656. CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
  1657. BS.cancelScheduling(VL, VL0);
  1658. newTreeEntry(VL, false, UserTreeIdx, S);
  1659. DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
  1660. << *VL[i] << '\n');
  1661. return;
  1662. }
  1663. }
  1664. newTreeEntry(VL, true, UserTreeIdx, S);
  1665. for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
  1666. ValueList Operands;
  1667. // Prepare the operand vector.
  1668. for (Value *j : VL) {
1669. CallInst *CI2 = cast<CallInst>(j);
  1670. Operands.push_back(CI2->getArgOperand(i));
  1671. }
  1672. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1673. }
  1674. return;
  1675. }
  1676. case Instruction::ShuffleVector:
1677. // If this is not an alternating sequence of opcodes like add-sub,
1678. // then do not vectorize this instruction.
  1679. if (!S.IsAltShuffle) {
  1680. BS.cancelScheduling(VL, VL0);
  1681. newTreeEntry(VL, false, UserTreeIdx, S);
  1682. DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
  1683. return;
  1684. }
  1685. newTreeEntry(VL, true, UserTreeIdx, S);
  1686. DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
  1687. // Reorder operands if reordering would enable vectorization.
  1688. if (isa<BinaryOperator>(VL0)) {
  1689. ValueList Left, Right;
  1690. reorderAltShuffleOperands(S.Opcode, VL, Left, Right);
  1691. buildTree_rec(Left, Depth + 1, UserTreeIdx);
  1692. buildTree_rec(Right, Depth + 1, UserTreeIdx, 1);
  1693. return;
  1694. }
  1695. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1696. ValueList Operands;
  1697. // Prepare the operand vector.
  1698. for (Value *j : VL)
  1699. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1700. buildTree_rec(Operands, Depth + 1, UserTreeIdx, i);
  1701. }
  1702. return;
  1703. default:
  1704. BS.cancelScheduling(VL, VL0);
  1705. newTreeEntry(VL, false, UserTreeIdx, S);
  1706. DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  1707. return;
  1708. }
  1709. }
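// Returns the number of elements of the homogeneous aggregate type \p T if
// it can be mapped onto a vector type of the same store size that fits the
// target's vector register limits, and 0 otherwise.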
  1710. unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  1711. unsigned N;
  1712. Type *EltTy;
  1713. auto *ST = dyn_cast<StructType>(T);
  1714. if (ST) {
  1715. N = ST->getNumElements();
  1716. EltTy = *ST->element_begin();
  1717. } else {
  1718. N = cast<ArrayType>(T)->getNumElements();
  1719. EltTy = cast<ArrayType>(T)->getElementType();
  1720. }
  1721. if (!isValidElementType(EltTy))
  1722. return 0;
  1723. uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
  1724. if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
  1725. return 0;
  1726. if (ST) {
  1727. // Check that struct is homogeneous.
  1728. for (const auto *Ty : ST->elements())
  1729. if (Ty != EltTy)
  1730. return 0;
  1731. }
  1732. return N;
  1733. }
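// Returns true if every extract in \p VL reads lane I (its position in the
// bundle) from one common source vector/aggregate whose element count
// matches the bundle width, so that source can be reused directly.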
  1734. bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
  1735. Instruction *E0 = cast<Instruction>(OpValue);
  1736. assert(E0->getOpcode() == Instruction::ExtractElement ||
  1737. E0->getOpcode() == Instruction::ExtractValue);
  1738. assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode");
  1739. // Check if all of the extracts come from the same vector and from the
  1740. // correct offset.
  1741. Value *Vec = E0->getOperand(0);
  1742. // We have to extract from a vector/aggregate with the same number of elements.
  1743. unsigned NElts;
  1744. if (E0->getOpcode() == Instruction::ExtractValue) {
  1745. const DataLayout &DL = E0->getModule()->getDataLayout();
  1746. NElts = canMapToVector(Vec->getType(), DL);
  1747. if (!NElts)
  1748. return false;
  1749. // Check if load can be rewritten as load of vector.
  1750. LoadInst *LI = dyn_cast<LoadInst>(Vec);
  1751. if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
  1752. return false;
  1753. } else {
  1754. NElts = Vec->getType()->getVectorNumElements();
  1755. }
  1756. if (NElts != VL.size())
  1757. return false;
  1758. // Check that all of the indices extract from the correct offset.
  1759. for (unsigned I = 0, E = VL.size(); I < E; ++I) {
  1760. Instruction *Inst = cast<Instruction>(VL[I]);
  1761. if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
  1762. return false;
  1763. if (Inst->getOperand(0) != Vec)
  1764. return false;
  1765. }
  1766. return true;
  1767. }
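// Returns true if \p I has a single use or every user of \p I is already
// mapped to a tree entry, i.e. will itself be vectorized.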
  1768. bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
  1769. return I->hasOneUse() ||
  1770. std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
  1771. return ScalarToTreeEntry.count(U) > 0;
  1772. });
  1773. }
  1774. int BoUpSLP::getEntryCost(TreeEntry *E) {
  1775. ArrayRef<Value*> VL = E->Scalars;
  1776. Type *ScalarTy = VL[0]->getType();
  1777. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1778. ScalarTy = SI->getValueOperand()->getType();
  1779. else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
  1780. ScalarTy = CI->getOperand(0)->getType();
  1781. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1782. // If we have computed a smaller type for the expression, update VecTy so
  1783. // that the costs will be accurate.
  1784. if (MinBWs.count(VL[0]))
  1785. VecTy = VectorType::get(
  1786. IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
  1787. if (E->NeedToGather) {
  1788. if (allConstant(VL))
  1789. return 0;
  1790. if (isSplat(VL)) {
  1791. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  1792. }
  1793. if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) {
  1794. Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
  1795. if (ShuffleKind.hasValue()) {
  1796. int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
  1797. for (auto *V : VL) {
  1798. // If all users of instruction are going to be vectorized and this
  1799. // instruction itself is not going to be vectorized, consider this
  1800. // instruction as dead and remove its cost from the final cost of the
  1801. // vectorized tree.
  1802. if (areAllUsersVectorized(cast<Instruction>(V)) &&
  1803. !ScalarToTreeEntry.count(V)) {
  1804. auto *IO = cast<ConstantInt>(
  1805. cast<ExtractElementInst>(V)->getIndexOperand());
  1806. Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
  1807. IO->getZExtValue());
  1808. }
  1809. }
  1810. return Cost;
  1811. }
  1812. }
  1813. return getGatherCost(E->Scalars);
  1814. }
  1815. InstructionsState S = getSameOpcode(VL);
  1816. assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
  1817. Instruction *VL0 = cast<Instruction>(S.OpValue);
  1818. unsigned ShuffleOrOp = S.IsAltShuffle ?
  1819. (unsigned) Instruction::ShuffleVector : S.Opcode;
  1820. switch (ShuffleOrOp) {
  1821. case Instruction::PHI:
  1822. return 0;
  1823. case Instruction::ExtractValue:
  1824. case Instruction::ExtractElement:
  1825. if (canReuseExtract(VL, S.OpValue)) {
  1826. int DeadCost = 0;
  1827. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
1828. Instruction *EI = cast<Instruction>(VL[i]);
1829. // If all users are going to be vectorized, the instruction can be
1830. // considered dead.
1831. // Likewise, if it has only one user, it will certainly be vectorized.
1832. if (areAllUsersVectorized(EI))
  1833. // Take credit for instruction that will become dead.
  1834. DeadCost +=
  1835. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
  1836. }
  1837. return -DeadCost;
  1838. }
  1839. return getGatherCost(VecTy);
  1840. case Instruction::ZExt:
  1841. case Instruction::SExt:
  1842. case Instruction::FPToUI:
  1843. case Instruction::FPToSI:
  1844. case Instruction::FPExt:
  1845. case Instruction::PtrToInt:
  1846. case Instruction::IntToPtr:
  1847. case Instruction::SIToFP:
  1848. case Instruction::UIToFP:
  1849. case Instruction::Trunc:
  1850. case Instruction::FPTrunc:
  1851. case Instruction::BitCast: {
  1852. Type *SrcTy = VL0->getOperand(0)->getType();
  1853. // Calculate the cost of this instruction.
  1854. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
  1855. VL0->getType(), SrcTy, VL0);
  1856. VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
  1857. int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
  1858. return VecCost - ScalarCost;
  1859. }
  1860. case Instruction::FCmp:
  1861. case Instruction::ICmp:
  1862. case Instruction::Select: {
  1863. // Calculate the cost of this instruction.
  1864. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
  1865. int ScalarCost = VecTy->getNumElements() *
  1866. TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
  1867. int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
  1868. return VecCost - ScalarCost;
  1869. }
  1870. case Instruction::Add:
  1871. case Instruction::FAdd:
  1872. case Instruction::Sub:
  1873. case Instruction::FSub:
  1874. case Instruction::Mul:
  1875. case Instruction::FMul:
  1876. case Instruction::UDiv:
  1877. case Instruction::SDiv:
  1878. case Instruction::FDiv:
  1879. case Instruction::URem:
  1880. case Instruction::SRem:
  1881. case Instruction::FRem:
  1882. case Instruction::Shl:
  1883. case Instruction::LShr:
  1884. case Instruction::AShr:
  1885. case Instruction::And:
  1886. case Instruction::Or:
  1887. case Instruction::Xor: {
  1888. // Certain instructions can be cheaper to vectorize if they have a
  1889. // constant second vector operand.
  1890. TargetTransformInfo::OperandValueKind Op1VK =
  1891. TargetTransformInfo::OK_AnyValue;
  1892. TargetTransformInfo::OperandValueKind Op2VK =
  1893. TargetTransformInfo::OK_UniformConstantValue;
  1894. TargetTransformInfo::OperandValueProperties Op1VP =
  1895. TargetTransformInfo::OP_None;
  1896. TargetTransformInfo::OperandValueProperties Op2VP =
  1897. TargetTransformInfo::OP_None;
  1898. // If all operands are exactly the same ConstantInt then set the
  1899. // operand kind to OK_UniformConstantValue.
  1900. // If instead not all operands are constants, then set the operand kind
  1901. // to OK_AnyValue. If all operands are constants but not the same,
  1902. // then set the operand kind to OK_NonUniformConstantValue.
  1903. ConstantInt *CInt = nullptr;
  1904. for (unsigned i = 0; i < VL.size(); ++i) {
  1905. const Instruction *I = cast<Instruction>(VL[i]);
  1906. if (!isa<ConstantInt>(I->getOperand(1))) {
  1907. Op2VK = TargetTransformInfo::OK_AnyValue;
  1908. break;
  1909. }
  1910. if (i == 0) {
  1911. CInt = cast<ConstantInt>(I->getOperand(1));
  1912. continue;
  1913. }
  1914. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
  1915. CInt != cast<ConstantInt>(I->getOperand(1)))
  1916. Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
  1917. }
1918. // FIXME: Currently the cost model modification for division by a power of
1919. // 2 is only handled for X86 and AArch64. Add support for other targets.
  1920. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
  1921. CInt->getValue().isPowerOf2())
  1922. Op2VP = TargetTransformInfo::OP_PowerOf2;
  1923. SmallVector<const Value *, 4> Operands(VL0->operand_values());
  1924. int ScalarCost =
  1925. VecTy->getNumElements() *
  1926. TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
  1927. Op2VP, Operands);
  1928. int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
  1929. Op1VP, Op2VP, Operands);
  1930. return VecCost - ScalarCost;
  1931. }
  1932. case Instruction::GetElementPtr: {
  1933. TargetTransformInfo::OperandValueKind Op1VK =
  1934. TargetTransformInfo::OK_AnyValue;
  1935. TargetTransformInfo::OperandValueKind Op2VK =
  1936. TargetTransformInfo::OK_UniformConstantValue;
  1937. int ScalarCost =
  1938. VecTy->getNumElements() *
  1939. TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
  1940. int VecCost =
  1941. TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
  1942. return VecCost - ScalarCost;
  1943. }
  1944. case Instruction::Load: {
  1945. // Cost of wide load - cost of scalar loads.
1946. unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
  1947. int ScalarLdCost = VecTy->getNumElements() *
  1948. TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
  1949. int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
  1950. VecTy, alignment, 0, VL0);
  1951. return VecLdCost - ScalarLdCost;
  1952. }
  1953. case Instruction::Store: {
  1954. // We know that we can merge the stores. Calculate the cost.
1955. unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
  1956. int ScalarStCost = VecTy->getNumElements() *
  1957. TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
  1958. int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
  1959. VecTy, alignment, 0, VL0);
  1960. return VecStCost - ScalarStCost;
  1961. }
  1962. case Instruction::Call: {
  1963. CallInst *CI = cast<CallInst>(VL0);
  1964. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1965. // Calculate the cost of the scalar and vector calls.
  1966. SmallVector<Type*, 4> ScalarTys;
  1967. for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
  1968. ScalarTys.push_back(CI->getArgOperand(op)->getType());
  1969. FastMathFlags FMF;
  1970. if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
  1971. FMF = FPMO->getFastMathFlags();
  1972. int ScalarCallCost = VecTy->getNumElements() *
  1973. TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
  1974. SmallVector<Value *, 4> Args(CI->arg_operands());
  1975. int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
  1976. VecTy->getNumElements());
  1977. DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
  1978. << " (" << VecCallCost << "-" << ScalarCallCost << ")"
  1979. << " for " << *CI << "\n");
  1980. return VecCallCost - ScalarCallCost;
  1981. }
  1982. case Instruction::ShuffleVector: {
  1983. TargetTransformInfo::OperandValueKind Op1VK =
  1984. TargetTransformInfo::OK_AnyValue;
  1985. TargetTransformInfo::OperandValueKind Op2VK =
  1986. TargetTransformInfo::OK_AnyValue;
  1987. int ScalarCost = 0;
  1988. int VecCost = 0;
  1989. for (Value *i : VL) {
1990. Instruction *I = cast<Instruction>(i);
  1993. ScalarCost +=
  1994. TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
  1995. }
1996. // VecCost is the sum of the cost of creating the two vectors
1997. // and the cost of creating the shuffle.
  1998. Instruction *I0 = cast<Instruction>(VL[0]);
  1999. VecCost =
  2000. TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
  2001. Instruction *I1 = cast<Instruction>(VL[1]);
  2002. VecCost +=
  2003. TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
  2004. VecCost +=
  2005. TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
  2006. return VecCost - ScalarCost;
  2007. }
  2008. default:
  2009. llvm_unreachable("Unknown instruction");
  2010. }
  2011. }
  2012. bool BoUpSLP::isFullyVectorizableTinyTree() {
  2013. DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
2014. VectorizableTree.size() << " is fully vectorizable.\n");
  2015. // We only handle trees of heights 1 and 2.
  2016. if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
  2017. return true;
  2018. if (VectorizableTree.size() != 2)
  2019. return false;
  2020. // Handle splat and all-constants stores.
  2021. if (!VectorizableTree[0].NeedToGather &&
  2022. (allConstant(VectorizableTree[1].Scalars) ||
  2023. isSplat(VectorizableTree[1].Scalars)))
  2024. return true;
  2025. // Gathering cost would be too much for tiny trees.
  2026. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
  2027. return false;
  2028. return true;
  2029. }
  2030. bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
  2031. // We can vectorize the tree if its size is greater than or equal to the
  2032. // minimum size specified by the MinTreeSize command line option.
  2033. if (VectorizableTree.size() >= MinTreeSize)
  2034. return false;
  2035. // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  2036. // can vectorize it if we can prove it fully vectorizable.
  2037. if (isFullyVectorizableTinyTree())
  2038. return false;
2039. assert((!VectorizableTree.empty() ||
2040. ExternalUses.empty()) &&
2041. "We shouldn't have any external users");
  2042. // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  2043. // vectorizable.
  2044. return true;
  2045. }
  2046. int BoUpSLP::getSpillCost() {
  2047. // Walk from the bottom of the tree to the top, tracking which values are
  2048. // live. When we see a call instruction that is not part of our tree,
  2049. // query TTI to see if there is a cost to keeping values live over it
  2050. // (for example, if spills and fills are required).
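// Each live scalar stands in for a whole bundle, so the types passed to TTI
// below are the post-vectorization vector types of width BundleWidth.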
  2051. unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  2052. int Cost = 0;
  2053. SmallPtrSet<Instruction*, 4> LiveValues;
  2054. Instruction *PrevInst = nullptr;
  2055. for (const auto &N : VectorizableTree) {
  2056. Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
  2057. if (!Inst)
  2058. continue;
  2059. if (!PrevInst) {
  2060. PrevInst = Inst;
  2061. continue;
  2062. }
  2063. // Update LiveValues.
  2064. LiveValues.erase(PrevInst);
  2065. for (auto &J : PrevInst->operands()) {
  2066. if (isa<Instruction>(&*J) && getTreeEntry(&*J))
  2067. LiveValues.insert(cast<Instruction>(&*J));
  2068. }
  2069. DEBUG(
  2070. dbgs() << "SLP: #LV: " << LiveValues.size();
  2071. for (auto *X : LiveValues)
  2072. dbgs() << " " << X->getName();
  2073. dbgs() << ", Looking at ";
  2074. Inst->dump();
  2075. );
  2076. // Now find the sequence of instructions between PrevInst and Inst.
  2077. BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
  2078. PrevInstIt =
  2079. PrevInst->getIterator().getReverse();
  2080. while (InstIt != PrevInstIt) {
  2081. if (PrevInstIt == PrevInst->getParent()->rend()) {
  2082. PrevInstIt = Inst->getParent()->rbegin();
  2083. continue;
  2084. }
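// A call between the two bundle roots may clobber vector registers, so ask
// the target what it costs to keep the currently live bundles across it.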
  2085. if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
  2086. SmallVector<Type*, 4> V;
  2087. for (auto *II : LiveValues)
  2088. V.push_back(VectorType::get(II->getType(), BundleWidth));
  2089. Cost += TTI->getCostOfKeepingLiveOverCall(V);
  2090. }
  2091. ++PrevInstIt;
  2092. }
  2093. PrevInst = Inst;
  2094. }
  2095. return Cost;
  2096. }
  2097. int BoUpSLP::getTreeCost() {
  2098. int Cost = 0;
  2099. DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
  2100. VectorizableTree.size() << ".\n");
  2101. unsigned BundleWidth = VectorizableTree[0].Scalars.size();
  2102. for (TreeEntry &TE : VectorizableTree) {
  2103. int C = getEntryCost(&TE);
  2104. DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
  2105. << *TE.Scalars[0] << ".\n");
  2106. Cost += C;
  2107. }
  2108. SmallSet<Value *, 16> ExtractCostCalculated;
  2109. int ExtractCost = 0;
  2110. for (ExternalUser &EU : ExternalUses) {
  2111. // We only add extract cost once for the same scalar.
  2112. if (!ExtractCostCalculated.insert(EU.Scalar).second)
  2113. continue;
  2114. // Uses by ephemeral values are free (because the ephemeral value will be
  2115. // removed prior to code generation, and so the extraction will be
  2116. // removed as well).
  2117. if (EphValues.count(EU.User))
  2118. continue;
  2119. // If we plan to rewrite the tree in a smaller type, we will need to sign
  2120. // extend the extracted value back to the original type. Here, we account
  2121. // for the extract and the added cost of the sign extend if needed.
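// For example, if the tree was narrowed to i8 but the scalars are used as i32,
// each external use pays for an extractelement plus a zext/sext back to i32.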
  2122. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
  2123. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  2124. if (MinBWs.count(ScalarRoot)) {
  2125. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
  2126. auto Extend =
  2127. MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
  2128. VecTy = VectorType::get(MinTy, BundleWidth);
  2129. ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
  2130. VecTy, EU.Lane);
  2131. } else {
  2132. ExtractCost +=
  2133. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
  2134. }
  2135. }
  2136. int SpillCost = getSpillCost();
  2137. Cost += SpillCost + ExtractCost;
  2138. std::string Str;
  2139. {
  2140. raw_string_ostream OS(Str);
  2141. OS << "SLP: Spill Cost = " << SpillCost << ".\n"
  2142. << "SLP: Extract Cost = " << ExtractCost << ".\n"
  2143. << "SLP: Total Cost = " << Cost << ".\n";
  2144. }
  2145. DEBUG(dbgs() << Str);
  2146. if (ViewSLPTree)
  2147. ViewGraph(this, "SLP" + F->getName(), false, Str);
  2148. return Cost;
  2149. }
  2150. int BoUpSLP::getGatherCost(Type *Ty) {
  2151. int Cost = 0;
  2152. for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
  2153. Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  2154. return Cost;
  2155. }
  2156. int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  2157. // Find the type of the operands in VL.
  2158. Type *ScalarTy = VL[0]->getType();
  2159. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  2160. ScalarTy = SI->getValueOperand()->getType();
  2161. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  2162. // Find the cost of inserting/extracting values from the vector.
  2163. return getGatherCost(VecTy);
  2164. }
  2165. // Reorder commutative operations in alternate shuffle if the resulting vectors
  2166. // are consecutive loads. This would allow us to vectorize the tree.
  2167. // If we have something like-
  2168. // load a[0] - load b[0]
  2169. // load b[1] + load a[1]
  2170. // load a[2] - load b[2]
  2171. // load a[3] + load b[3]
  2172. // Reordering the second load b[1] load a[1] would allow us to vectorize this
  2173. // code.
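// Only operands of commutative lanes are swapped, so the semantics of the
// alternating add/sub pattern are preserved.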
  2174. void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef<Value *> VL,
  2175. SmallVectorImpl<Value *> &Left,
  2176. SmallVectorImpl<Value *> &Right) {
  2177. // Push left and right operands of binary operation into Left and Right
  2178. unsigned AltOpcode = getAltOpcode(Opcode);
  2179. (void)AltOpcode;
  2180. for (Value *V : VL) {
  2181. auto *I = cast<Instruction>(V);
  2182. assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
  2183. "Incorrect instruction in vector");
  2184. Left.push_back(I->getOperand(0));
  2185. Right.push_back(I->getOperand(1));
  2186. }
2187. // Reorder if we have a commutative operation and consecutive accesses
  2188. // are on either side of the alternate instructions.
  2189. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  2190. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  2191. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  2192. Instruction *VL1 = cast<Instruction>(VL[j]);
  2193. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  2194. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  2195. std::swap(Left[j], Right[j]);
  2196. continue;
  2197. } else if (VL2->isCommutative() &&
  2198. isConsecutiveAccess(L, L1, *DL, *SE)) {
  2199. std::swap(Left[j + 1], Right[j + 1]);
  2200. continue;
  2201. }
  2202. // else unchanged
  2203. }
  2204. }
  2205. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  2206. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  2207. Instruction *VL1 = cast<Instruction>(VL[j]);
  2208. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  2209. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  2210. std::swap(Left[j], Right[j]);
  2211. continue;
  2212. } else if (VL2->isCommutative() &&
  2213. isConsecutiveAccess(L, L1, *DL, *SE)) {
  2214. std::swap(Left[j + 1], Right[j + 1]);
  2215. continue;
  2216. }
  2217. // else unchanged
  2218. }
  2219. }
  2220. }
  2221. }
2222. // Return true if I should be commuted before adding its left and right
  2223. // operands to the arrays Left and Right.
  2224. //
2225. // The vectorizer is trying either to have all elements on one side be
2226. // instructions with the same opcode, to enable further vectorization, or to
2227. // have a splat to lower the vectorization cost.
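// For example, if every lane computes "x_i op a" for the same value 'a',
// keeping 'a' as the right operand of every lane lets the right-hand vector
// be built as a cheap broadcast of 'a'.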
  2228. static bool shouldReorderOperands(
  2229. int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
  2230. ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
  2231. bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
  2232. VLeft = I.getOperand(0);
  2233. VRight = I.getOperand(1);
  2234. // If we have "SplatRight", try to see if commuting is needed to preserve it.
  2235. if (SplatRight) {
  2236. if (VRight == Right[i - 1])
  2237. // Preserve SplatRight
  2238. return false;
  2239. if (VLeft == Right[i - 1]) {
  2240. // Commuting would preserve SplatRight, but we don't want to break
  2241. // SplatLeft either, i.e. preserve the original order if possible.
  2242. // (FIXME: why do we care?)
  2243. if (SplatLeft && VLeft == Left[i - 1])
  2244. return false;
  2245. return true;
  2246. }
  2247. }
2248. // Symmetrically handle SplatLeft.
  2249. if (SplatLeft) {
  2250. if (VLeft == Left[i - 1])
  2251. // Preserve SplatLeft
  2252. return false;
  2253. if (VRight == Left[i - 1])
  2254. return true;
  2255. }
  2256. Instruction *ILeft = dyn_cast<Instruction>(VLeft);
  2257. Instruction *IRight = dyn_cast<Instruction>(VRight);
2258. // If we have "AllSameOpcodeRight", check whether the left operand preserves
2259. // it and the right operand does not; in that case we want to commute.
  2260. if (AllSameOpcodeRight) {
  2261. unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
  2262. if (IRight && RightPrevOpcode == IRight->getOpcode())
  2263. // Do not commute, a match on the right preserves AllSameOpcodeRight
  2264. return false;
  2265. if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
  2266. // We have a match and may want to commute, but first check if there is
  2267. // not also a match on the existing operands on the Left to preserve
  2268. // AllSameOpcodeLeft, i.e. preserve the original order if possible.
  2269. // (FIXME: why do we care?)
  2270. if (AllSameOpcodeLeft && ILeft &&
  2271. cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
  2272. return false;
  2273. return true;
  2274. }
  2275. }
  2276. // Symmetrically handle Left side.
  2277. if (AllSameOpcodeLeft) {
  2278. unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
  2279. if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
  2280. return false;
  2281. if (IRight && LeftPrevOpcode == IRight->getOpcode())
  2282. return true;
  2283. }
  2284. return false;
  2285. }
  2286. void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode,
  2287. ArrayRef<Value *> VL,
  2288. SmallVectorImpl<Value *> &Left,
  2289. SmallVectorImpl<Value *> &Right) {
  2290. if (!VL.empty()) {
  2291. // Peel the first iteration out of the loop since there's nothing
  2292. // interesting to do anyway and it simplifies the checks in the loop.
  2293. auto *I = cast<Instruction>(VL[0]);
  2294. Value *VLeft = I->getOperand(0);
  2295. Value *VRight = I->getOperand(1);
  2296. if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
  2297. // Favor having instruction to the right. FIXME: why?
  2298. std::swap(VLeft, VRight);
  2299. Left.push_back(VLeft);
  2300. Right.push_back(VRight);
  2301. }
  2302. // Keep track if we have instructions with all the same opcode on one side.
  2303. bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
  2304. bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
  2305. // Keep track if we have one side with all the same value (broadcast).
  2306. bool SplatLeft = true;
  2307. bool SplatRight = true;
  2308. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  2309. Instruction *I = cast<Instruction>(VL[i]);
  2310. assert(((I->getOpcode() == Opcode && I->isCommutative()) ||
  2311. (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) &&
  2312. "Can only process commutative instruction");
  2313. // Commute to favor either a splat or maximizing having the same opcodes on
  2314. // one side.
  2315. Value *VLeft;
  2316. Value *VRight;
  2317. if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft,
  2318. AllSameOpcodeRight, SplatLeft, SplatRight, VLeft,
  2319. VRight)) {
  2320. Left.push_back(VRight);
  2321. Right.push_back(VLeft);
  2322. } else {
  2323. Left.push_back(VLeft);
  2324. Right.push_back(VRight);
  2325. }
  2326. // Update Splat* and AllSameOpcode* after the insertion.
  2327. SplatRight = SplatRight && (Right[i - 1] == Right[i]);
  2328. SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
  2329. AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
  2330. (cast<Instruction>(Left[i - 1])->getOpcode() ==
  2331. cast<Instruction>(Left[i])->getOpcode());
  2332. AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
  2333. (cast<Instruction>(Right[i - 1])->getOpcode() ==
  2334. cast<Instruction>(Right[i])->getOpcode());
  2335. }
2336. // If one operand ends up being a broadcast, return this operand order.
  2337. if (SplatRight || SplatLeft)
  2338. return;
  2339. // Finally check if we can get longer vectorizable chain by reordering
  2340. // without breaking the good operand order detected above.
  2341. // E.g. If we have something like-
  2342. // load a[0] load b[0]
  2343. // load b[1] load a[1]
  2344. // load a[2] load b[2]
  2345. // load a[3] load b[3]
  2346. // Reordering the second load b[1] load a[1] would allow us to vectorize
  2347. // this code and we still retain AllSameOpcode property.
  2348. // FIXME: This load reordering might break AllSameOpcode in some rare cases
  2349. // such as-
  2350. // add a[0],c[0] load b[0]
  2351. // add a[1],c[2] load b[1]
2352. // load a[2] load b[2]
  2353. // add a[3],c[3] load b[3]
  2354. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  2355. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  2356. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  2357. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  2358. std::swap(Left[j + 1], Right[j + 1]);
  2359. continue;
  2360. }
  2361. }
  2362. }
  2363. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  2364. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  2365. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  2366. std::swap(Left[j + 1], Right[j + 1]);
  2367. continue;
  2368. }
  2369. }
  2370. }
  2371. // else unchanged
  2372. }
  2373. }
  2374. void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL, Value *OpValue) {
  2375. // Get the basic block this bundle is in. All instructions in the bundle
  2376. // should be in this block.
  2377. auto *Front = cast<Instruction>(OpValue);
  2378. auto *BB = Front->getParent();
  2379. const unsigned Opcode = cast<Instruction>(OpValue)->getOpcode();
  2380. const unsigned AltOpcode = getAltOpcode(Opcode);
  2381. assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool {
  2382. return !sameOpcodeOrAlt(Opcode, AltOpcode,
  2383. cast<Instruction>(V)->getOpcode()) ||
  2384. cast<Instruction>(V)->getParent() == BB;
  2385. }));
  2386. // The last instruction in the bundle in program order.
  2387. Instruction *LastInst = nullptr;
  2388. // Find the last instruction. The common case should be that BB has been
  2389. // scheduled, and the last instruction is VL.back(). So we start with
  2390. // VL.back() and iterate over schedule data until we reach the end of the
  2391. // bundle. The end of the bundle is marked by null ScheduleData.
  2392. if (BlocksSchedules.count(BB)) {
  2393. auto *Bundle =
  2394. BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
  2395. if (Bundle && Bundle->isPartOfBundle())
  2396. for (; Bundle; Bundle = Bundle->NextInBundle)
  2397. if (Bundle->OpValue == Bundle->Inst)
  2398. LastInst = Bundle->Inst;
  2399. }
  2400. // LastInst can still be null at this point if there's either not an entry
  2401. // for BB in BlocksSchedules or there's no ScheduleData available for
  2402. // VL.back(). This can be the case if buildTree_rec aborts for various
  2403. // reasons (e.g., the maximum recursion depth is reached, the maximum region
  2404. // size is reached, etc.). ScheduleData is initialized in the scheduling
  2405. // "dry-run".
  2406. //
  2407. // If this happens, we can still find the last instruction by brute force. We
  2408. // iterate forwards from Front (inclusive) until we either see all
  2409. // instructions in the bundle or reach the end of the block. If Front is the
  2410. // last instruction in program order, LastInst will be set to Front, and we
  2411. // will visit all the remaining instructions in the block.
  2412. //
  2413. // One of the reasons we exit early from buildTree_rec is to place an upper
  2414. // bound on compile-time. Thus, taking an additional compile-time hit here is
  2415. // not ideal. However, this should be exceedingly rare since it requires that
  2416. // we both exit early from buildTree_rec and that the bundle be out-of-order
  2417. // (causing us to iterate all the way to the end of the block).
  2418. if (!LastInst) {
  2419. SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
  2420. for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
  2421. if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode()))
  2422. LastInst = &I;
  2423. if (Bundle.empty())
  2424. break;
  2425. }
  2426. }
  2427. // Set the insertion point after the last instruction in the bundle. Set the
  2428. // debug location to Front.
  2429. Builder.SetInsertPoint(BB, ++LastInst->getIterator());
  2430. Builder.SetCurrentDebugLocation(Front->getDebugLoc());
  2431. }
  2432. Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  2433. Value *Vec = UndefValue::get(Ty);
  2434. // Generate the 'InsertElement' instruction.
  2435. for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
  2436. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
  2437. if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
  2438. GatherSeq.insert(Insrt);
  2439. CSEBlocks.insert(Insrt->getParent());
  2440. // Add to our 'need-to-extract' list.
  2441. if (TreeEntry *E = getTreeEntry(VL[i])) {
  2442. // Find which lane we need to extract.
  2443. int FoundLane = -1;
  2444. for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
  2445. // Is this the lane of the scalar that we are looking for ?
  2446. if (E->Scalars[Lane] == VL[i]) {
  2447. FoundLane = Lane;
  2448. break;
  2449. }
  2450. }
  2451. assert(FoundLane >= 0 && "Could not find the correct lane");
  2452. ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
  2453. }
  2454. }
  2455. }
  2456. return Vec;
  2457. }
  2458. Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
  2459. if (const TreeEntry *En = getTreeEntry(OpValue)) {
  2460. if (En->isSame(VL) && En->VectorizedValue)
  2461. return En->VectorizedValue;
  2462. }
  2463. return nullptr;
  2464. }
  2465. Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int OpdNum, int UserIndx) {
  2466. InstructionsState S = getSameOpcode(VL);
  2467. if (S.Opcode) {
  2468. if (TreeEntry *E = getTreeEntry(S.OpValue)) {
  2469. TreeEntry *UserTreeEntry = nullptr;
  2470. if (UserIndx != -1)
  2471. UserTreeEntry = &VectorizableTree[UserIndx];
  2472. if (E->isSame(VL) ||
  2473. (UserTreeEntry &&
  2474. (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
  2475. !UserTreeEntry->ShuffleMask[OpdNum].empty() &&
  2476. E->isFoundJumbled(VL, *DL, *SE)))
  2477. return vectorizeTree(E, OpdNum, UserIndx);
  2478. }
  2479. }
  2480. Type *ScalarTy = S.OpValue->getType();
  2481. if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
  2482. ScalarTy = SI->getValueOperand()->getType();
  2483. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  2484. return Gather(VL, VecTy);
  2485. }
  2486. Value *BoUpSLP::vectorizeTree(TreeEntry *E, int OpdNum, int UserIndx) {
  2487. IRBuilder<>::InsertPointGuard Guard(Builder);
  2488. TreeEntry *UserTreeEntry = nullptr;
  2489. if (E->VectorizedValue) {
  2490. DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
  2491. return E->VectorizedValue;
  2492. }
  2493. InstructionsState S = getSameOpcode(E->Scalars);
  2494. Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  2495. Type *ScalarTy = VL0->getType();
  2496. if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
  2497. ScalarTy = SI->getValueOperand()->getType();
  2498. VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
  2499. if (E->NeedToGather) {
  2500. setInsertPointAfterBundle(E->Scalars, VL0);
  2501. auto *V = Gather(E->Scalars, VecTy);
  2502. E->VectorizedValue = V;
  2503. return V;
  2504. }
  2505. assert(ScalarToTreeEntry.count(E->Scalars[0]) &&
  2506. "Expected user tree entry, missing!");
  2507. int CurrIndx = ScalarToTreeEntry[E->Scalars[0]];
  2508. unsigned ShuffleOrOp = S.IsAltShuffle ?
  2509. (unsigned) Instruction::ShuffleVector : S.Opcode;
  2510. switch (ShuffleOrOp) {
  2511. case Instruction::PHI: {
2512. PHINode *PH = cast<PHINode>(VL0);
  2513. Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
  2514. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  2515. PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  2516. E->VectorizedValue = NewPhi;
  2517. // PHINodes may have multiple entries from the same block. We want to
  2518. // visit every block once.
  2519. SmallSet<BasicBlock*, 4> VisitedBBs;
  2520. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  2521. ValueList Operands;
  2522. BasicBlock *IBB = PH->getIncomingBlock(i);
  2523. if (!VisitedBBs.insert(IBB).second) {
  2524. NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
  2525. continue;
  2526. }
  2527. // Prepare the operand vector.
  2528. for (Value *V : E->Scalars)
  2529. Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
  2530. Builder.SetInsertPoint(IBB->getTerminator());
  2531. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  2532. Value *Vec = vectorizeTree(Operands, i, CurrIndx);
  2533. NewPhi->addIncoming(Vec, IBB);
  2534. }
  2535. assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
  2536. "Invalid number of incoming values");
  2537. return NewPhi;
  2538. }
  2539. case Instruction::ExtractElement: {
  2540. if (canReuseExtract(E->Scalars, VL0)) {
  2541. Value *V = VL0->getOperand(0);
  2542. E->VectorizedValue = V;
  2543. return V;
  2544. }
  2545. setInsertPointAfterBundle(E->Scalars, VL0);
  2546. auto *V = Gather(E->Scalars, VecTy);
  2547. E->VectorizedValue = V;
  2548. return V;
  2549. }
  2550. case Instruction::ExtractValue: {
  2551. if (canReuseExtract(E->Scalars, VL0)) {
  2552. LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
  2553. Builder.SetInsertPoint(LI);
  2554. PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
  2555. Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
  2556. LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
  2557. E->VectorizedValue = V;
  2558. return propagateMetadata(V, E->Scalars);
  2559. }
  2560. setInsertPointAfterBundle(E->Scalars, VL0);
  2561. auto *V = Gather(E->Scalars, VecTy);
  2562. E->VectorizedValue = V;
  2563. return V;
  2564. }
  2565. case Instruction::ZExt:
  2566. case Instruction::SExt:
  2567. case Instruction::FPToUI:
  2568. case Instruction::FPToSI:
  2569. case Instruction::FPExt:
  2570. case Instruction::PtrToInt:
  2571. case Instruction::IntToPtr:
  2572. case Instruction::SIToFP:
  2573. case Instruction::UIToFP:
  2574. case Instruction::Trunc:
  2575. case Instruction::FPTrunc:
  2576. case Instruction::BitCast: {
  2577. ValueList INVL;
  2578. for (Value *V : E->Scalars)
  2579. INVL.push_back(cast<Instruction>(V)->getOperand(0));
  2580. setInsertPointAfterBundle(E->Scalars, VL0);
  2581. Value *InVec = vectorizeTree(INVL, 0, CurrIndx);
  2582. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2583. return V;
2584. CastInst *CI = cast<CastInst>(VL0);
  2585. Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
  2586. E->VectorizedValue = V;
  2587. ++NumVectorInstructions;
  2588. return V;
  2589. }
  2590. case Instruction::FCmp:
  2591. case Instruction::ICmp: {
  2592. ValueList LHSV, RHSV;
  2593. for (Value *V : E->Scalars) {
  2594. LHSV.push_back(cast<Instruction>(V)->getOperand(0));
  2595. RHSV.push_back(cast<Instruction>(V)->getOperand(1));
  2596. }
  2597. setInsertPointAfterBundle(E->Scalars, VL0);
  2598. Value *L = vectorizeTree(LHSV, 0, CurrIndx);
  2599. Value *R = vectorizeTree(RHSV, 1, CurrIndx);
  2600. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2601. return V;
  2602. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  2603. Value *V;
  2604. if (S.Opcode == Instruction::FCmp)
  2605. V = Builder.CreateFCmp(P0, L, R);
  2606. else
  2607. V = Builder.CreateICmp(P0, L, R);
  2608. E->VectorizedValue = V;
  2609. propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
  2610. ++NumVectorInstructions;
  2611. return V;
  2612. }
  2613. case Instruction::Select: {
  2614. ValueList TrueVec, FalseVec, CondVec;
  2615. for (Value *V : E->Scalars) {
  2616. CondVec.push_back(cast<Instruction>(V)->getOperand(0));
  2617. TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
  2618. FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
  2619. }
  2620. setInsertPointAfterBundle(E->Scalars, VL0);
  2621. Value *Cond = vectorizeTree(CondVec, 0, CurrIndx);
  2622. Value *True = vectorizeTree(TrueVec, 1, CurrIndx);
  2623. Value *False = vectorizeTree(FalseVec, 2, CurrIndx);
  2624. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2625. return V;
  2626. Value *V = Builder.CreateSelect(Cond, True, False);
  2627. E->VectorizedValue = V;
  2628. ++NumVectorInstructions;
  2629. return V;
  2630. }
  2631. case Instruction::Add:
  2632. case Instruction::FAdd:
  2633. case Instruction::Sub:
  2634. case Instruction::FSub:
  2635. case Instruction::Mul:
  2636. case Instruction::FMul:
  2637. case Instruction::UDiv:
  2638. case Instruction::SDiv:
  2639. case Instruction::FDiv:
  2640. case Instruction::URem:
  2641. case Instruction::SRem:
  2642. case Instruction::FRem:
  2643. case Instruction::Shl:
  2644. case Instruction::LShr:
  2645. case Instruction::AShr:
  2646. case Instruction::And:
  2647. case Instruction::Or:
  2648. case Instruction::Xor: {
  2649. ValueList LHSVL, RHSVL;
  2650. if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
  2651. reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
  2652. RHSVL);
  2653. else
  2654. for (Value *V : E->Scalars) {
  2655. auto *I = cast<Instruction>(V);
  2656. LHSVL.push_back(I->getOperand(0));
  2657. RHSVL.push_back(I->getOperand(1));
  2658. }
  2659. setInsertPointAfterBundle(E->Scalars, VL0);
  2660. Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
  2661. Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
  2662. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2663. return V;
  2664. Value *V = Builder.CreateBinOp(
  2665. static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
  2666. E->VectorizedValue = V;
  2667. propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
  2668. ++NumVectorInstructions;
  2669. if (Instruction *I = dyn_cast<Instruction>(V))
  2670. return propagateMetadata(I, E->Scalars);
  2671. return V;
  2672. }
  2673. case Instruction::Load: {
  2674. // Loads are inserted at the head of the tree because we don't want to
  2675. // sink them all the way down past store instructions.
  2676. setInsertPointAfterBundle(E->Scalars, VL0);
  2677. if (UserIndx != -1)
  2678. UserTreeEntry = &VectorizableTree[UserIndx];
  2679. bool isJumbled = false;
2680. LoadInst *LI = nullptr;
  2681. if (UserTreeEntry &&
  2682. (unsigned)OpdNum < UserTreeEntry->ShuffleMask.size() &&
  2683. !UserTreeEntry->ShuffleMask[OpdNum].empty()) {
  2684. isJumbled = true;
  2685. LI = cast<LoadInst>(E->Scalars[0]);
  2686. } else {
  2687. LI = cast<LoadInst>(VL0);
  2688. }
  2689. Type *ScalarLoadTy = LI->getType();
  2690. unsigned AS = LI->getPointerAddressSpace();
  2691. Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
  2692. VecTy->getPointerTo(AS));
  2693. // The pointer operand uses an in-tree scalar so we add the new BitCast to
  2694. // ExternalUses list to make sure that an extract will be generated in the
  2695. // future.
  2696. Value *PO = LI->getPointerOperand();
  2697. if (getTreeEntry(PO))
  2698. ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
  2699. unsigned Alignment = LI->getAlignment();
  2700. LI = Builder.CreateLoad(VecPtr);
  2701. if (!Alignment) {
  2702. Alignment = DL->getABITypeAlignment(ScalarLoadTy);
  2703. }
  2704. LI->setAlignment(Alignment);
  2705. E->VectorizedValue = LI;
  2706. ++NumVectorInstructions;
  2707. propagateMetadata(LI, E->Scalars);
  2708. if (isJumbled) {
  2709. SmallVector<Constant *, 8> Mask;
  2710. for (unsigned LaneEntry : UserTreeEntry->ShuffleMask[OpdNum])
  2711. Mask.push_back(Builder.getInt32(LaneEntry));
  2712. // Generate shuffle for jumbled memory access
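// The shuffle rearranges the lanes of the wide load so they line up with the
// lane order the user bundle expects.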
  2713. Value *Undef = UndefValue::get(VecTy);
  2714. Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
  2715. ConstantVector::get(Mask));
  2716. E->VectorizedValue = Shuf;
  2717. ++NumVectorInstructions;
  2718. return Shuf;
  2719. }
  2720. return LI;
  2721. }
  2722. case Instruction::Store: {
  2723. StoreInst *SI = cast<StoreInst>(VL0);
  2724. unsigned Alignment = SI->getAlignment();
  2725. unsigned AS = SI->getPointerAddressSpace();
  2726. ValueList ScalarStoreValues;
  2727. for (Value *V : E->Scalars)
  2728. ScalarStoreValues.push_back(cast<StoreInst>(V)->getValueOperand());
  2729. setInsertPointAfterBundle(E->Scalars, VL0);
  2730. Value *VecValue = vectorizeTree(ScalarStoreValues, 0, CurrIndx);
  2731. Value *ScalarPtr = SI->getPointerOperand();
  2732. Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS));
  2733. StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
  2734. // The pointer operand uses an in-tree scalar, so add the new BitCast to
  2735. // ExternalUses to make sure that an extract will be generated in the
  2736. // future.
  2737. if (getTreeEntry(ScalarPtr))
  2738. ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
  2739. if (!Alignment)
  2740. Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
  2741. S->setAlignment(Alignment);
  2742. E->VectorizedValue = S;
  2743. ++NumVectorInstructions;
  2744. return propagateMetadata(S, E->Scalars);
  2745. }
  2746. case Instruction::GetElementPtr: {
  2747. setInsertPointAfterBundle(E->Scalars, VL0);
  2748. ValueList Op0VL;
  2749. for (Value *V : E->Scalars)
  2750. Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
  2751. Value *Op0 = vectorizeTree(Op0VL, 0, CurrIndx);
  2752. std::vector<Value *> OpVecs;
  2753. for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
  2754. ++j) {
  2755. ValueList OpVL;
  2756. for (Value *V : E->Scalars)
  2757. OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
  2758. Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
  2759. OpVecs.push_back(OpVec);
  2760. }
  2761. Value *V = Builder.CreateGEP(
  2762. cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
  2763. E->VectorizedValue = V;
  2764. ++NumVectorInstructions;
  2765. if (Instruction *I = dyn_cast<Instruction>(V))
  2766. return propagateMetadata(I, E->Scalars);
  2767. return V;
  2768. }
  2769. case Instruction::Call: {
  2770. CallInst *CI = cast<CallInst>(VL0);
  2771. setInsertPointAfterBundle(E->Scalars, VL0);
  2772. Function *FI;
  2773. Intrinsic::ID IID = Intrinsic::not_intrinsic;
  2774. Value *ScalarArg = nullptr;
2775. if ((FI = CI->getCalledFunction())) {
  2776. IID = FI->getIntrinsicID();
  2777. }
  2778. std::vector<Value *> OpVecs;
  2779. for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
  2780. ValueList OpVL;
  2781. // ctlz,cttz and powi are special intrinsics whose second argument is
  2782. // a scalar. This argument should not be vectorized.
  2783. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
  2784. CallInst *CEI = cast<CallInst>(VL0);
  2785. ScalarArg = CEI->getArgOperand(j);
  2786. OpVecs.push_back(CEI->getArgOperand(j));
  2787. continue;
  2788. }
  2789. for (Value *V : E->Scalars) {
  2790. CallInst *CEI = cast<CallInst>(V);
  2791. OpVL.push_back(CEI->getArgOperand(j));
  2792. }
  2793. Value *OpVec = vectorizeTree(OpVL, j, CurrIndx);
  2794. DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
  2795. OpVecs.push_back(OpVec);
  2796. }
  2797. Module *M = F->getParent();
  2798. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  2799. Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
  2800. Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
  2801. SmallVector<OperandBundleDef, 1> OpBundles;
  2802. CI->getOperandBundlesAsDefs(OpBundles);
  2803. Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
  2804. // The scalar argument uses an in-tree scalar so we add the new vectorized
  2805. // call to ExternalUses list to make sure that an extract will be
  2806. // generated in the future.
  2807. if (ScalarArg && getTreeEntry(ScalarArg))
  2808. ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
  2809. E->VectorizedValue = V;
  2810. propagateIRFlags(E->VectorizedValue, E->Scalars, VL0);
  2811. ++NumVectorInstructions;
  2812. return V;
  2813. }
  2814. case Instruction::ShuffleVector: {
  2815. ValueList LHSVL, RHSVL;
  2816. assert(Instruction::isBinaryOp(S.Opcode) &&
  2817. "Invalid Shuffle Vector Operand");
  2818. reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
  2819. setInsertPointAfterBundle(E->Scalars, VL0);
  2820. Value *LHS = vectorizeTree(LHSVL, 0, CurrIndx);
  2821. Value *RHS = vectorizeTree(RHSVL, 1, CurrIndx);
  2822. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2823. return V;
  2824. // Create a vector of LHS op1 RHS
  2825. Value *V0 = Builder.CreateBinOp(
  2826. static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
  2827. unsigned AltOpcode = getAltOpcode(S.Opcode);
  2828. // Create a vector of LHS op2 RHS
  2829. Value *V1 = Builder.CreateBinOp(
  2830. static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
  2831. // Create shuffle to take alternate operations from the vector.
  2832. // Also, gather up odd and even scalar ops to propagate IR flags to
  2833. // each vector operation.
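// For a bundle of width 4 the mask ends up as <0, 5, 2, 7>: even lanes are
// taken from V0 (S.Opcode) and odd lanes from V1 (the alternate opcode).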
  2834. ValueList OddScalars, EvenScalars;
  2835. unsigned e = E->Scalars.size();
  2836. SmallVector<Constant *, 8> Mask(e);
  2837. for (unsigned i = 0; i < e; ++i) {
  2838. if (isOdd(i)) {
  2839. Mask[i] = Builder.getInt32(e + i);
  2840. OddScalars.push_back(E->Scalars[i]);
  2841. } else {
  2842. Mask[i] = Builder.getInt32(i);
  2843. EvenScalars.push_back(E->Scalars[i]);
  2844. }
  2845. }
  2846. Value *ShuffleMask = ConstantVector::get(Mask);
  2847. propagateIRFlags(V0, EvenScalars);
  2848. propagateIRFlags(V1, OddScalars);
  2849. Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
  2850. E->VectorizedValue = V;
  2851. ++NumVectorInstructions;
  2852. if (Instruction *I = dyn_cast<Instruction>(V))
  2853. return propagateMetadata(I, E->Scalars);
  2854. return V;
  2855. }
  2856. default:
  2857. llvm_unreachable("unknown inst");
  2858. }
  2859. return nullptr;
  2860. }
  2861. Value *BoUpSLP::vectorizeTree() {
  2862. ExtraValueToDebugLocsMap ExternallyUsedValues;
  2863. return vectorizeTree(ExternallyUsedValues);
  2864. }
  2865. Value *
  2866. BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  2867. // All blocks must be scheduled before any instructions are inserted.
  2868. for (auto &BSIter : BlocksSchedules) {
  2869. scheduleBlock(BSIter.second.get());
  2870. }
  2871. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2872. auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
  2873. // If the vectorized tree can be rewritten in a smaller type, we truncate the
  2874. // vectorized root. InstCombine will then rewrite the entire expression. We
  2875. // sign extend the extracted values below.
  2876. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  2877. if (MinBWs.count(ScalarRoot)) {
  2878. if (auto *I = dyn_cast<Instruction>(VectorRoot))
  2879. Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
  2880. auto BundleWidth = VectorizableTree[0].Scalars.size();
  2881. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
  2882. auto *VecTy = VectorType::get(MinTy, BundleWidth);
  2883. auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
  2884. VectorizableTree[0].VectorizedValue = Trunc;
  2885. }
2886. DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values.\n");
2887. // If necessary, sign-extend or zero-extend the extracted value back to the
2888. // larger type specified by ScalarType.
  2889. auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
  2890. if (!MinBWs.count(ScalarRoot))
  2891. return Ex;
  2892. if (MinBWs[ScalarRoot].second)
  2893. return Builder.CreateSExt(Ex, ScalarType);
  2894. return Builder.CreateZExt(Ex, ScalarType);
  2895. };
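// 'extend' is applied to every extract below so that out-of-tree users keep
// seeing the original (wider) scalar type even when the tree was narrowed.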
  2896. // Extract all of the elements with the external uses.
  2897. for (const auto &ExternalUse : ExternalUses) {
  2898. Value *Scalar = ExternalUse.Scalar;
  2899. llvm::User *User = ExternalUse.User;
2900. // Skip users that we have already RAUW'd. This happens when one instruction
  2901. // has multiple uses of the same value.
  2902. if (User && !is_contained(Scalar->users(), User))
  2903. continue;
  2904. TreeEntry *E = getTreeEntry(Scalar);
  2905. assert(E && "Invalid scalar");
  2906. assert((!E->NeedToGather) && "Extracting from a gather list");
  2907. Value *Vec = dyn_cast<ShuffleVectorInst>(E->VectorizedValue);
2908. if (Vec && isa<LoadInst>(cast<Instruction>(Vec)->getOperand(0))) {
  2909. Vec = cast<Instruction>(E->VectorizedValue)->getOperand(0);
  2910. } else {
  2911. Vec = E->VectorizedValue;
  2912. }
  2913. assert(Vec && "Can't find vectorizable value");
  2914. Value *Lane = Builder.getInt32(ExternalUse.Lane);
  2915. // If User == nullptr, the Scalar is used as extra arg. Generate
  2916. // ExtractElement instruction and update the record for this scalar in
  2917. // ExternallyUsedValues.
  2918. if (!User) {
  2919. assert(ExternallyUsedValues.count(Scalar) &&
  2920. "Scalar with nullptr as an external user must be registered in "
  2921. "ExternallyUsedValues map");
  2922. if (auto *VecI = dyn_cast<Instruction>(Vec)) {
  2923. Builder.SetInsertPoint(VecI->getParent(),
  2924. std::next(VecI->getIterator()));
  2925. } else {
  2926. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2927. }
  2928. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2929. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2930. CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
  2931. auto &Locs = ExternallyUsedValues[Scalar];
  2932. ExternallyUsedValues.insert({Ex, Locs});
  2933. ExternallyUsedValues.erase(Scalar);
  2934. continue;
  2935. }
  2936. // Generate extracts for out-of-tree users.
  2937. // Find the insertion point for the extractelement lane.
  2938. if (auto *VecI = dyn_cast<Instruction>(Vec)) {
  2939. if (PHINode *PH = dyn_cast<PHINode>(User)) {
  2940. for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
  2941. if (PH->getIncomingValue(i) == Scalar) {
  2942. TerminatorInst *IncomingTerminator =
  2943. PH->getIncomingBlock(i)->getTerminator();
  2944. if (isa<CatchSwitchInst>(IncomingTerminator)) {
  2945. Builder.SetInsertPoint(VecI->getParent(),
  2946. std::next(VecI->getIterator()));
  2947. } else {
  2948. Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
  2949. }
  2950. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2951. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2952. CSEBlocks.insert(PH->getIncomingBlock(i));
  2953. PH->setOperand(i, Ex);
  2954. }
  2955. }
  2956. } else {
  2957. Builder.SetInsertPoint(cast<Instruction>(User));
  2958. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2959. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2960. CSEBlocks.insert(cast<Instruction>(User)->getParent());
  2961. User->replaceUsesOfWith(Scalar, Ex);
  2962. }
  2963. } else {
  2964. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2965. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2966. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2967. CSEBlocks.insert(&F->getEntryBlock());
  2968. User->replaceUsesOfWith(Scalar, Ex);
  2969. }
  2970. DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  2971. }
  2972. // For each vectorized value:
  2973. for (TreeEntry &EIdx : VectorizableTree) {
  2974. TreeEntry *Entry = &EIdx;
  2975. // No need to handle users of gathered values.
  2976. if (Entry->NeedToGather)
  2977. continue;
  2978. assert(Entry->VectorizedValue && "Can't find vectorizable value");
  2979. // For each lane:
  2980. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  2981. Value *Scalar = Entry->Scalars[Lane];
  2982. Type *Ty = Scalar->getType();
  2983. if (!Ty->isVoidTy()) {
  2984. #ifndef NDEBUG
  2985. for (User *U : Scalar->users()) {
  2986. DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
  2987. // It is legal to replace users in the ignorelist by undef.
  2988. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
  2989. "Replacing out-of-tree value with undef");
  2990. }
  2991. #endif
  2992. Value *Undef = UndefValue::get(Ty);
  2993. Scalar->replaceAllUsesWith(Undef);
  2994. }
  2995. DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
  2996. eraseInstruction(cast<Instruction>(Scalar));
  2997. }
  2998. }
  2999. Builder.ClearInsertionPoint();
  3000. return VectorizableTree[0].VectorizedValue;
  3001. }
  3002. void BoUpSLP::optimizeGatherSequence(Function &F) {
  3003. DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
  3004. << " gather sequences instructions.\n");
  3005. // LICM InsertElementInst sequences.
  3006. for (Instruction *it : GatherSeq) {
  3007. InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
  3008. if (!Insert)
  3009. continue;
  3010. // Check if this block is inside a loop.
  3011. Loop *L = LI->getLoopFor(Insert->getParent());
  3012. if (!L)
  3013. continue;
  3014. // Check if it has a preheader.
  3015. BasicBlock *PreHeader = L->getLoopPreheader();
  3016. if (!PreHeader)
  3017. continue;
  3018. // If the vector or the element that we insert into it are
  3019. // instructions that are defined in this basic block then we can't
  3020. // hoist this instruction.
  3021. Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
  3022. Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
  3023. if (CurrVec && L->contains(CurrVec))
  3024. continue;
  3025. if (NewElem && L->contains(NewElem))
  3026. continue;
  3027. // We can hoist this instruction. Move it to the pre-header.
  3028. Insert->moveBefore(PreHeader->getTerminator());
  3029. }
  3030. // Perform O(N^2) search over the gather sequences and merge identical
  3031. // instructions. TODO: We can further optimize this scan if we split the
  3032. // instructions into different buckets based on the insert lane.
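// An identical gather instruction is only reused when the copy seen earlier
// dominates the later one, so all uses still see a defined value.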
  3033. SmallVector<Instruction *, 16> Visited;
  3034. ReversePostOrderTraversal<Function *> RPOT(&F);
  3035. for (auto BB : RPOT) {
  3036. // Traverse CSEBlocks by RPOT order.
  3037. if (!CSEBlocks.count(BB))
  3038. continue;
  3039. // For all instructions in blocks containing gather sequences:
  3040. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
  3041. Instruction *In = &*it++;
  3042. if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
  3043. continue;
  3044. // Check if we can replace this instruction with any of the
  3045. // visited instructions.
  3046. for (Instruction *v : Visited) {
  3047. if (In->isIdenticalTo(v) &&
  3048. DT->dominates(v->getParent(), In->getParent())) {
  3049. In->replaceAllUsesWith(v);
  3050. eraseInstruction(In);
  3051. In = nullptr;
  3052. break;
  3053. }
  3054. }
  3055. if (In) {
  3056. assert(!is_contained(Visited, In));
  3057. Visited.push_back(In);
  3058. }
  3059. }
  3060. }
  3061. CSEBlocks.clear();
  3062. GatherSeq.clear();
  3063. }
3064. // Groups the instructions into a bundle (which is then a single scheduling entity)
  3065. // and schedules instructions until the bundle gets ready.
  3066. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
  3067. BoUpSLP *SLP, Value *OpValue) {
  3068. if (isa<PHINode>(OpValue))
  3069. return true;
  3070. // Initialize the instruction bundle.
  3071. Instruction *OldScheduleEnd = ScheduleEnd;
  3072. ScheduleData *PrevInBundle = nullptr;
  3073. ScheduleData *Bundle = nullptr;
  3074. bool ReSchedule = false;
  3075. DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
  3076. // Make sure that the scheduling region contains all
  3077. // instructions of the bundle.
  3078. for (Value *V : VL) {
  3079. if (!extendSchedulingRegion(V, OpValue))
  3080. return false;
  3081. }
  3082. for (Value *V : VL) {
  3083. ScheduleData *BundleMember = getScheduleData(V);
  3084. assert(BundleMember &&
  3085. "no ScheduleData for bundle member (maybe not in same basic block)");
  3086. if (BundleMember->IsScheduled) {
  3087. // A bundle member was scheduled as single instruction before and now
  3088. // needs to be scheduled as part of the bundle. We just get rid of the
  3089. // existing schedule.
  3090. DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
  3091. << " was already scheduled\n");
  3092. ReSchedule = true;
  3093. }
  3094. assert(BundleMember->isSchedulingEntity() &&
  3095. "bundle member already part of other bundle");
  3096. if (PrevInBundle) {
  3097. PrevInBundle->NextInBundle = BundleMember;
  3098. } else {
  3099. Bundle = BundleMember;
  3100. }
  3101. BundleMember->UnscheduledDepsInBundle = 0;
  3102. Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
  3103. // Group the instructions to a bundle.
  3104. BundleMember->FirstInBundle = Bundle;
  3105. PrevInBundle = BundleMember;
  3106. }
  3107. if (ScheduleEnd != OldScheduleEnd) {
  3108. // The scheduling region got new instructions at the lower end (or it is a
  3109. // new region for the first bundle). This makes it necessary to
  3110. // recalculate all dependencies.
  3111. // It is seldom that this needs to be done a second time after adding the
  3112. // initial bundle to the region.
  3113. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  3114. doForAllOpcodes(I, [](ScheduleData *SD) {
  3115. SD->clearDependencies();
  3116. });
  3117. }
  3118. ReSchedule = true;
  3119. }
  3120. if (ReSchedule) {
  3121. resetSchedule();
  3122. initialFillReadyList(ReadyInsts);
  3123. }
  3124. DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
  3125. << BB->getName() << "\n");
  3126. calculateDependencies(Bundle, true, SLP);
  3127. // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  3128. // means that there are no cyclic dependencies and we can schedule it.
3129. // Note that it's important that we don't "schedule" the bundle yet (see
  3130. // cancelScheduling).
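// Scheduling proceeds over the ready list; if the bundle never becomes ready
// here, it is treated as having a cyclic dependency and is cancelled below.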
  3131. while (!Bundle->isReady() && !ReadyInsts.empty()) {
  3132. ScheduleData *pickedSD = ReadyInsts.back();
  3133. ReadyInsts.pop_back();
  3134. if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
  3135. schedule(pickedSD, ReadyInsts);
  3136. }
  3137. }
  3138. if (!Bundle->isReady()) {
  3139. cancelScheduling(VL, OpValue);
  3140. return false;
  3141. }
  3142. return true;
  3143. }
  3144. void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
  3145. Value *OpValue) {
  3146. if (isa<PHINode>(OpValue))
  3147. return;
  3148. ScheduleData *Bundle = getScheduleData(OpValue);
  3149. DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  3150. assert(!Bundle->IsScheduled &&
  3151. "Can't cancel bundle which is already scheduled");
  3152. assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
  3153. "tried to unbundle something which is not a bundle");
  3154. // Un-bundle: make single instructions out of the bundle.
  3155. ScheduleData *BundleMember = Bundle;
  3156. while (BundleMember) {
  3157. assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
  3158. BundleMember->FirstInBundle = BundleMember;
  3159. ScheduleData *Next = BundleMember->NextInBundle;
  3160. BundleMember->NextInBundle = nullptr;
  3161. BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
  3162. if (BundleMember->UnscheduledDepsInBundle == 0) {
  3163. ReadyInsts.insert(BundleMember);
  3164. }
  3165. BundleMember = Next;
  3166. }
  3167. }
  3168. BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  3169. // Allocate a new ScheduleData for the instruction.
  3170. if (ChunkPos >= ChunkSize) {
  3171. ScheduleDataChunks.push_back(llvm::make_unique<ScheduleData[]>(ChunkSize));
  3172. ChunkPos = 0;
  3173. }
  3174. return &(ScheduleDataChunks.back()[ChunkPos++]);
  3175. }
  3176. bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
  3177. Value *OpValue) {
  3178. if (getScheduleData(V, isOneOf(OpValue, V)))
  3179. return true;
  3180. Instruction *I = dyn_cast<Instruction>(V);
  3181. assert(I && "bundle member must be an instruction");
  3182. assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
3183. auto &&CheckScheduleForI = [this, OpValue](Instruction *I) -> bool {
  3184. ScheduleData *ISD = getScheduleData(I);
  3185. if (!ISD)
  3186. return false;
  3187. assert(isInSchedulingRegion(ISD) &&
  3188. "ScheduleData not in scheduling region");
  3189. ScheduleData *SD = allocateScheduleDataChunks();
  3190. SD->Inst = I;
  3191. SD->init(SchedulingRegionID, OpValue);
  3192. ExtraScheduleDataMap[I][OpValue] = SD;
  3193. return true;
  3194. };
3195. if (CheckScheduleForI(I))
  3196. return true;
  3197. if (!ScheduleStart) {
  3198. // It's the first instruction in the new region.
  3199. initScheduleData(I, I->getNextNode(), nullptr, nullptr);
  3200. ScheduleStart = I;
  3201. ScheduleEnd = I->getNextNode();
  3202. if (isOneOf(OpValue, I) != I)
3203. CheckScheduleForI(I);
  3204. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  3205. DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
  3206. return true;
  3207. }
  3208. // Search up and down at the same time, because we don't know if the new
  3209. // instruction is above or below the existing scheduling region.
  3210. BasicBlock::reverse_iterator UpIter =
  3211. ++ScheduleStart->getIterator().getReverse();
  3212. BasicBlock::reverse_iterator UpperEnd = BB->rend();
  3213. BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  3214. BasicBlock::iterator LowerEnd = BB->end();
  3215. while (true) {
  3216. if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
  3217. DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
  3218. return false;
  3219. }
  3220. if (UpIter != UpperEnd) {
  3221. if (&*UpIter == I) {
  3222. initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  3223. ScheduleStart = I;
  3224. if (isOneOf(OpValue, I) != I)
3225. CheckScheduleForI(I);
  3226. DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
  3227. return true;
  3228. }
  3229. UpIter++;
  3230. }
  3231. if (DownIter != LowerEnd) {
  3232. if (&*DownIter == I) {
  3233. initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  3234. nullptr);
  3235. ScheduleEnd = I->getNextNode();
  3236. if (isOneOf(OpValue, I) != I)
3237. CheckScheduleForI(I);
  3238. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  3239. DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  3240. return true;
  3241. }
  3242. DownIter++;
  3243. }
  3244. assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
  3245. "instruction not found in block");
  3246. }
  3247. return true;
  3248. }
  3249. void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
  3250. Instruction *ToI,
  3251. ScheduleData *PrevLoadStore,
  3252. ScheduleData *NextLoadStore) {
  3253. ScheduleData *CurrentLoadStore = PrevLoadStore;
  3254. for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
  3255. ScheduleData *SD = ScheduleDataMap[I];
  3256. if (!SD) {
  3257. SD = allocateScheduleDataChunks();
  3258. ScheduleDataMap[I] = SD;
  3259. SD->Inst = I;
  3260. }
  3261. assert(!isInSchedulingRegion(SD) &&
  3262. "new ScheduleData already in scheduling region");
  3263. SD->init(SchedulingRegionID, I);
  3264. if (I->mayReadOrWriteMemory() &&
  3265. (!isa<IntrinsicInst>(I) ||
  3266. cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect)) {
  3267. // Update the linked list of memory accessing instructions.
  3268. if (CurrentLoadStore) {
  3269. CurrentLoadStore->NextLoadStore = SD;
  3270. } else {
  3271. FirstLoadStoreInRegion = SD;
  3272. }
  3273. CurrentLoadStore = SD;
  3274. }
  3275. }
  3276. if (NextLoadStore) {
  3277. if (CurrentLoadStore)
  3278. CurrentLoadStore->NextLoadStore = NextLoadStore;
  3279. } else {
  3280. LastLoadStoreInRegion = CurrentLoadStore;
  3281. }
  3282. }
  3283. void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
  3284. bool InsertInReadyList,
  3285. BoUpSLP *SLP) {
  3286. assert(SD->isSchedulingEntity());
  3287. SmallVector<ScheduleData *, 10> WorkList;
  3288. WorkList.push_back(SD);
  3289. while (!WorkList.empty()) {
  3290. ScheduleData *SD = WorkList.back();
  3291. WorkList.pop_back();
  3292. ScheduleData *BundleMember = SD;
  3293. while (BundleMember) {
  3294. assert(isInSchedulingRegion(BundleMember));
  3295. if (!BundleMember->hasValidDependencies()) {
  3296. DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
  3297. BundleMember->Dependencies = 0;
  3298. BundleMember->resetUnscheduledDeps();
  3299. // Handle def-use chain dependencies.
  3300. if (BundleMember->OpValue != BundleMember->Inst) {
  3301. ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
  3302. if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
  3303. BundleMember->Dependencies++;
  3304. ScheduleData *DestBundle = UseSD->FirstInBundle;
  3305. if (!DestBundle->IsScheduled)
  3306. BundleMember->incrementUnscheduledDeps(1);
  3307. if (!DestBundle->hasValidDependencies())
  3308. WorkList.push_back(DestBundle);
  3309. }
  3310. } else {
  3311. for (User *U : BundleMember->Inst->users()) {
  3312. if (isa<Instruction>(U)) {
  3313. ScheduleData *UseSD = getScheduleData(U);
  3314. if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
  3315. BundleMember->Dependencies++;
  3316. ScheduleData *DestBundle = UseSD->FirstInBundle;
  3317. if (!DestBundle->IsScheduled)
  3318. BundleMember->incrementUnscheduledDeps(1);
  3319. if (!DestBundle->hasValidDependencies())
  3320. WorkList.push_back(DestBundle);
  3321. }
  3322. } else {
  3323. // I'm not sure if this can ever happen. But we need to be safe.
3324. // This keeps the instruction/bundle from ever being scheduled and
3325. // eventually disables vectorization.
  3326. BundleMember->Dependencies++;
  3327. BundleMember->incrementUnscheduledDeps(1);
  3328. }
  3329. }
  3330. }
  3331. // Handle the memory dependencies.
  3332. ScheduleData *DepDest = BundleMember->NextLoadStore;
  3333. if (DepDest) {
  3334. Instruction *SrcInst = BundleMember->Inst;
  3335. MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
  3336. bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
  3337. unsigned numAliased = 0;
  3338. unsigned DistToSrc = 1;
  3339. while (DepDest) {
  3340. assert(isInSchedulingRegion(DepDest));
  3341. // We have two limits to reduce the complexity:
  3342. // 1) AliasedCheckLimit: It's a small limit to reduce calls to
  3343. // SLP->isAliased (which is the expensive part in this loop).
  3344. // 2) MaxMemDepDistance: It's for very large blocks and it aborts
  3345. // the whole loop (even if the loop is fast, it's quadratic).
  3346. // It's important for the loop break condition (see below) to
  3347. // check this limit even between two read-only instructions.
  3348. if (DistToSrc >= MaxMemDepDistance ||
  3349. ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
  3350. (numAliased >= AliasedCheckLimit ||
  3351. SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
  3352. // We increment the counter only if the locations are aliased
  3353. // (instead of counting all alias checks). This gives a better
  3354. // balance between reduced runtime and accurate dependencies.
  3355. numAliased++;
  3356. DepDest->MemoryDependencies.push_back(BundleMember);
  3357. BundleMember->Dependencies++;
  3358. ScheduleData *DestBundle = DepDest->FirstInBundle;
  3359. if (!DestBundle->IsScheduled) {
  3360. BundleMember->incrementUnscheduledDeps(1);
  3361. }
  3362. if (!DestBundle->hasValidDependencies()) {
  3363. WorkList.push_back(DestBundle);
  3364. }
  3365. }
  3366. DepDest = DepDest->NextLoadStore;
  3367. // Example, explaining the loop break condition: Let's assume our
  3368. // starting instruction is i0 and MaxMemDepDistance = 3.
  3369. //
  3370. // +--------v--v--v
  3371. // i0,i1,i2,i3,i4,i5,i6,i7,i8
  3372. // +--------^--^--^
  3373. //
  3374. // MaxMemDepDistance let us stop alias-checking at i3 and we add
  3375. // dependencies from i0 to i3,i4,.. (even if they are not aliased).
  3376. // Previously we already added dependencies from i3 to i6,i7,i8
  3377. // (because of MaxMemDepDistance). As we added a dependency from
  3378. // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
  3379. // and we can abort this loop at i6.
  3380. if (DistToSrc >= 2 * MaxMemDepDistance)
  3381. break;
  3382. DistToSrc++;
  3383. }
  3384. }
  3385. }
  3386. BundleMember = BundleMember->NextInBundle;
  3387. }
  3388. if (InsertInReadyList && SD->isReady()) {
  3389. ReadyInsts.push_back(SD);
  3390. DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
  3391. }
  3392. }
  3393. }
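/// Mark every ScheduleData in the scheduling region as unscheduled and clear
/// the ready list so the region can be scheduled again.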
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    doForAllOpcodes(I, [&](ScheduleData *SD) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    });
  }
  ReadyInsts.clear();
}

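/// Perform the scheduling of the region in \p BS: pick ready bundles in an
/// order that stays close to the original instruction order and move their
/// instructions into their final places within the block.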
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated and fill the ready-list with
  // initial instructions.
  int Idx = 0;
  int NumToSchedule = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
      assert(SD->isPartOfBundle() ==
                 (getTreeEntry(SD->Inst) != nullptr) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;
      if (SD->isSchedulingEntity()) {
        BS->calculateDependencies(SD, false, this);
        NumToSchedule++;
      }
    });
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    ScheduleData *BundleMember = picked;
    while (BundleMember) {
      Instruction *pickedInst = BundleMember->Inst;
      if (LastScheduledInst->getNextNode() != pickedInst) {
        BS->BB->getInstList().remove(pickedInst);
        BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
                                     pickedInst);
      }
      LastScheduledInst = pickedInst;
      BundleMember = BundleMember->NextInBundle;
    }
    BS->schedule(picked, ReadyInsts);
    NumToSchedule--;
  }
  assert(NumToSchedule == 0 && "could not schedule all instructions");

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}

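/// Determine the element size to use when vectorizing an expression rooted at
/// \p V, preferring the width of the memory accesses that feed the expression.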
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V))
    Worklist.push_back(I);

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto MaxWidth = 0u;
  auto FoundUnknownInst = false;
  while (!Worklist.empty() && !FoundUnknownInst) {
    auto *I = Worklist.pop_back_val();
    Visited.insert(I);

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, give up.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      FoundUnknownInst = true;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    else if (isa<LoadInst>(I))
      MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited, we add it to the worklist.
    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
      for (Use &U : I->operands())
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (!Visited.count(J))
            Worklist.push_back(J);
    }

    // If we don't yet handle the instruction, give up.
    else
      FoundUnknownInst = true;
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!MaxWidth || FoundUnknownInst)
    return DL->getTypeSizeInBits(V->getType());

  // Otherwise, return the maximum width we found.
  return MaxWidth;
}

// Determine if a value V in a vectorizable expression Expr can be demoted to a
// smaller type with a truncation. We collect the values that will be demoted
// in ToDemote and additional roots that require investigating in Roots.
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
                                  SmallVectorImpl<Value *> &ToDemote,
                                  SmallVectorImpl<Value *> &Roots) {
  // We can always demote constants.
  if (isa<Constant>(V)) {
    ToDemote.push_back(V);
    return true;
  }

  // If the value is not an instruction in the expression with only one use, it
  // cannot be demoted.
  auto *I = dyn_cast<Instruction>(V);
  if (!I || !I->hasOneUse() || !Expr.count(I))
    return false;

  switch (I->getOpcode()) {

  // We can always demote truncations and extensions. Since truncations can
  // seed additional demotion, we save the truncated value.
  case Instruction::Trunc:
    Roots.push_back(I->getOperand(0));
    break;
  case Instruction::ZExt:
  case Instruction::SExt:
    break;

  // We can demote certain binary operations if we can demote both of their
  // operands.
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
        !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
      return false;
    break;

  // We can demote selects if we can demote their true and false values.
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
        !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
      return false;
    break;
  }

  // We can demote phis if we can demote all their incoming operands. Note that
  // we don't need to worry about cycles since we ensure single use above.
  case Instruction::PHI: {
    PHINode *PN = cast<PHINode>(I);
    for (Value *IncValue : PN->incoming_values())
      if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
        return false;
    break;
  }

  // Otherwise, conservatively give up.
  default:
    return false;
  }

  // Record the value that we can demote.
  ToDemote.push_back(V);
  return true;
}

void BoUpSLP::computeMinimumValueSizes() {
  // If there are no external uses, the expression tree must be rooted by a
  // store. We can't demote in-memory values, so there is nothing to do here.
  if (ExternalUses.empty())
    return;

  // We only attempt to truncate integer expressions.
  auto &TreeRoot = VectorizableTree[0].Scalars;
  auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
  if (!TreeRootIT)
    return;

  // If the expression is not rooted by a store, these roots should have
  // external uses. We will rely on InstCombine to rewrite the expression in
  // the narrower type. However, InstCombine only rewrites single-use values.
  // This means that if a tree entry other than a root is used externally, it
  // must have multiple uses and InstCombine will not rewrite it. The code
  // below ensures that only the roots are used externally.
  SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
  for (auto &EU : ExternalUses)
    if (!Expr.erase(EU.Scalar))
      return;
  if (!Expr.empty())
    return;

  // Collect the scalar values of the vectorizable expression. We will use this
  // context to determine which values can be demoted. If we see a truncation,
  // we mark it as seeding another demotion.
  for (auto &Entry : VectorizableTree)
    Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());

  // Ensure the roots of the vectorizable tree don't form a cycle. They must
  // have a single external user that is not in the vectorizable tree.
  for (auto *Root : TreeRoot)
    if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
      return;

  // Conservatively determine if we can actually truncate the roots of the
  // expression. Collect the values that can be demoted in ToDemote and
  // additional roots that require investigating in Roots.
  SmallVector<Value *, 32> ToDemote;
  SmallVector<Value *, 4> Roots;
  for (auto *Root : TreeRoot)
    if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
      return;

  // The maximum bit width required to represent all the values that can be
  // demoted without loss of precision. It would be safe to truncate the roots
  // of the expression to this width.
  auto MaxBitWidth = 8u;

  // We first check if all the bits of the roots are demanded. If they're not,
  // we can truncate the roots to this narrower type.
  for (auto *Root : TreeRoot) {
    auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
    MaxBitWidth = std::max<unsigned>(
        Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
  }

  // True if the roots can be zero-extended back to their original type, rather
  // than sign-extended. We know that if the leading bits are not demanded, we
  // can safely zero-extend. So we initialize IsKnownPositive to True.
  bool IsKnownPositive = true;

  // If all the bits of the roots are demanded, we can try a little harder to
  // compute a narrower type. This can happen, for example, if the roots are
  // getelementptr indices. InstCombine promotes these indices to the pointer
  // width. Thus, all their bits are technically demanded even though the
  // address computation might be vectorized in a smaller type.
  //
  // We start by looking at each entry that can be demoted. We compute the
  // maximum bit width required to store the scalar by using ValueTracking to
  // compute the number of high-order bits we can truncate.
  if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
    MaxBitWidth = 8u;

    // Determine if the sign bit of all the roots is known to be zero. If not,
    // IsKnownPositive is set to False.
    IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
      KnownBits Known = computeKnownBits(R, *DL);
      return Known.isNonNegative();
    });

    // Determine the maximum number of bits required to store the scalar
    // values.
    for (auto *Scalar : ToDemote) {
      auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
      auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
      MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
    }

    // If we can't prove that the sign bit is zero, we must add one to the
    // maximum bit width to account for the unknown sign bit. This preserves
    // the existing sign bit so we can safely sign-extend the root back to the
    // original type. Otherwise, if we know the sign bit is zero, we will
    // zero-extend the root instead.
    //
    // FIXME: This is somewhat suboptimal, as there will be cases where adding
    //        one to the maximum bit width will yield a larger-than-necessary
    //        type. In general, we need to add an extra bit only if we can't
    //        prove that the upper bit of the original type is equal to the
    //        upper bit of the proposed smaller type. If these two bits are the
    //        same (either zero or one) we know that sign-extending from the
    //        smaller type will result in the same value. Here, since we can't
    //        yet prove this, we are just making the proposed smaller type
    //        larger to ensure correctness.
    if (!IsKnownPositive)
      ++MaxBitWidth;
  }

  // Round MaxBitWidth up to the next power-of-two.
  if (!isPowerOf2_64(MaxBitWidth))
    MaxBitWidth = NextPowerOf2(MaxBitWidth);

  // If the maximum bit width we compute is less than the width of the roots'
  // type, we can proceed with the narrowing. Otherwise, do nothing.
  if (MaxBitWidth >= TreeRootIT->getBitWidth())
    return;

  // If we can truncate the root, we must collect additional values that might
  // be demoted as a result. That is, those seeded by truncations we will
  // modify.
  while (!Roots.empty())
    collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);

  // Finally, map the values we can demote to the maximum bit width we computed.
  for (auto *Scalar : ToDemote)
    MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
}

namespace {

/// The SLPVectorizer Pass.
struct SLPVectorizer : public FunctionPass {
  SLPVectorizerPass Impl;

  /// Pass identification, replacement for typeid
  static char ID;

  explicit SLPVectorizer() : FunctionPass(ID) {
    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  }

  bool doInitialization(Module &M) override {
    return false;
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    FunctionPass::getAnalysisUsage(AU);
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addPreserved<AAResultsWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

PreservedAnalyses SLPVectorizerPass::run(Function &F,
                                         FunctionAnalysisManager &AM) {
  auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  auto *AA = &AM.getResult<AAManager>(F);
  auto *LI = &AM.getResult<LoopAnalysis>(F);
  auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
  auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<AAManager>();
  PA.preserve<GlobalsAA>();
  return PA;
}

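/// Shared implementation of the legacy and new pass manager entry points.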
bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
                                TargetTransformInfo *TTI_,
                                TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
                                LoopInfo *LI_, DominatorTree *DT_,
                                AssumptionCache *AC_, DemandedBits *DB_,
                                OptimizationRemarkEmitter *ORE_) {
  SE = SE_;
  TTI = TTI_;
  TLI = TLI_;
  AA = AA_;
  LI = LI_;
  DT = DT_;
  AC = AC_;
  DB = DB_;
  DL = &F.getParent()->getDataLayout();

  Stores.clear();
  GEPs.clear();
  bool Changed = false;

  // If the target claims to have no vector registers don't attempt
  // vectorization.
  if (!TTI->getNumberOfRegisters(true))
    return false;

  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");

  // Use the bottom up slp vectorizer to construct chains that start with
  // store instructions.
  BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);

  // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  // delete instructions.

  // Scan the blocks in the function in post order.
  for (auto BB : post_order(&F.getEntryBlock())) {
    collectSeedInstructions(BB);

    // Vectorize trees that end at stores.
    if (!Stores.empty()) {
      DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
                   << " underlying objects.\n");
      Changed |= vectorizeStoreChains(R);
    }

    // Vectorize trees that end at reductions.
    Changed |= vectorizeChainsInBlock(BB, R);

    // Vectorize the index computations of getelementptr instructions. This
    // is primarily intended to catch gather-like idioms ending at
    // non-consecutive loads.
    if (!GEPs.empty()) {
      DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
                   << " underlying objects.\n");
      Changed |= vectorizeGEPIndices(BB, R);
    }
  }

  if (Changed) {
    R.optimizeGatherSequence(F);
    DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
    DEBUG(verifyFunction(F));
  }
  return Changed;
}

/// \brief Check that the Values in the slice in VL array are still existent in
/// the WeakTrackingVH array.
/// Vectorization of part of the VL array may cause later values in the VL array
/// to become invalid. We track when this has happened in the WeakTrackingVH
/// array.
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
                               ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
                               unsigned SliceSize) {
  VL = VL.slice(SliceBegin, SliceSize);
  VH = VH.slice(SliceBegin, SliceSize);
  return !std::equal(VL.begin(), VL.end(), VH.begin());
}

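/// Try to vectorize slices of the consecutive store chain \p Chain using a
/// vector register of \p VecRegSize bits.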
bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
                                            unsigned VecRegSize) {
  unsigned ChainLen = Chain.size();
  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
               << "\n");
  unsigned Sz = R.getVectorElementSize(Chain[0]);
  unsigned VF = VecRegSize / Sz;

  if (!isPowerOf2_32(Sz) || VF < 2)
    return false;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());

  bool Changed = false;
  // Look for profitable vectorizable trees at all offsets, starting at zero.
  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
    if (i + VF > e)
      break;

    // Check that a previous iteration of this loop did not delete the Value.
    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
      continue;

    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
                 << "\n");
    ArrayRef<Value *> Operands = Chain.slice(i, VF);

    R.buildTree(Operands);
    if (R.isTreeTinyAndNotFullyVectorizable())
      continue;

    R.computeMinimumValueSizes();

    int Cost = R.getTreeCost();

    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
    if (Cost < -SLPCostThreshold) {
      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");

      using namespace ore;

      R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
                                          cast<StoreInst>(Chain[i]))
                       << "Stores SLP vectorized with cost " << NV("Cost", Cost)
                       << " and with tree size "
                       << NV("TreeSize", R.getTreeSize()));

      R.vectorizeTree();

      // Move to the next bundle.
      i += VF - 1;
      Changed = true;
    }
  }

  return Changed;
}

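/// Group \p Stores into chains of consecutive accesses and try to vectorize
/// each chain, retrying with smaller vector register sizes on failure.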
bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
                                        BoUpSLP &R) {
  SetVector<StoreInst *> Heads;
  SmallDenseSet<StoreInst *> Tails;
  SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;

  // We may run into multiple chains that merge into a single chain. We mark the
  // stores that we vectorized so that we don't visit the same store twice.
  BoUpSLP::ValueSet VectorizedStores;
  bool Changed = false;

  // Do a quadratic search on all of the given stores in reverse order and find
  // all of the pairs of stores that follow each other.
  SmallVector<unsigned, 16> IndexQueue;
  unsigned E = Stores.size();
  IndexQueue.resize(E - 1);
  for (unsigned I = E; I > 0; --I) {
    unsigned Idx = I - 1;
    // If a store has multiple consecutive store candidates, search the Stores
    // array according to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
    // This is because pairing with an immediately succeeding or preceding
    // candidate usually creates the best chance to find an SLP vectorization
    // opportunity.
    unsigned Offset = 1;
    unsigned Cnt = 0;
    for (unsigned J = 0; J < E - 1; ++J, ++Offset) {
      if (Idx >= Offset) {
        IndexQueue[Cnt] = Idx - Offset;
        ++Cnt;
      }
      if (Idx + Offset < E) {
        IndexQueue[Cnt] = Idx + Offset;
        ++Cnt;
      }
    }

    for (auto K : IndexQueue) {
      if (isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE)) {
        Tails.insert(Stores[Idx]);
        Heads.insert(Stores[K]);
        ConsecutiveChain[Stores[K]] = Stores[Idx];
        break;
      }
    }
  }

  // For stores that start but don't end a link in the chain:
  for (auto *SI : llvm::reverse(Heads)) {
    if (Tails.count(SI))
      continue;

    // We found a store instr that starts a chain. Now follow the chain and try
    // to vectorize it.
    BoUpSLP::ValueList Operands;
    StoreInst *I = SI;
    // Collect the chain into a list.
    while ((Tails.count(I) || Heads.count(I)) && !VectorizedStores.count(I)) {
      Operands.push_back(I);
      // Move to the next value in the chain.
      I = ConsecutiveChain[I];
    }

    // FIXME: Is division-by-2 the correct step? Should we assert that the
    // register size is a power-of-2?
    for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
         Size /= 2) {
      if (vectorizeStoreChain(Operands, R, Size)) {
        // Mark the vectorized stores so that we don't vectorize them again.
        VectorizedStores.insert(Operands.begin(), Operands.end());
        Changed = true;
        break;
      }
    }
  }

  return Changed;
}

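/// Collect the store and getelementptr seed instructions of \p BB, bucketed
/// by the underlying object of their pointer operand.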
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();

  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
    }

    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      auto Idx = GEP->idx_begin()->get();
      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
    }
  }
}

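/// Try to vectorize the two-element list {A, B}.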
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  if (!A || !B)
    return false;
  Value *VL[] = { A, B };
  return tryToVectorizeList(VL, R, None, true);
}

bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                           ArrayRef<Value *> BuildVector,
                                           bool AllowReorder,
                                           bool NeedExtraction) {
  if (VL.size() < 2)
    return false;

  DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
               << ".\n");

  // Check that all of the parts are scalar instructions of the same type.
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;

  unsigned Opcode0 = I0->getOpcode();

  unsigned Sz = R.getVectorElementSize(I0);
  unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
  unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
  if (MaxVF < 2) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
             << "Cannot SLP vectorize list: vectorization factor "
             << "less than 2 is not supported";
    });
    return false;
  }

  for (Value *V : VL) {
    Type *Ty = V->getType();
    if (!isValidElementType(Ty)) {
      // NOTE: the following will give the user an internal LLVM type name,
      // which may not be useful.
      R.getORE()->emit([&]() {
        std::string type_str;
        llvm::raw_string_ostream rso(type_str);
        Ty->print(rso);
        return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
               << "Cannot SLP vectorize list: type "
               << rso.str() + " is unsupported by vectorizer";
      });
      return false;
    }
    Instruction *Inst = dyn_cast<Instruction>(V);

    if (!Inst)
      return false;
    if (Inst->getOpcode() != Opcode0) {
      R.getORE()->emit([&]() {
        return OptimizationRemarkMissed(SV_NAME, "InequableTypes", I0)
               << "Cannot SLP vectorize list: not all of the "
               << "parts of scalar instructions are of the same type: "
               << ore::NV("Instruction1Opcode", I0) << " and "
               << ore::NV("Instruction2Opcode", Inst);
      });
      return false;
    }
  }

  bool Changed = false;
  bool CandidateFound = false;
  int MinCost = SLPCostThreshold;

  // Keep track of values that were deleted by vectorizing in the loop below.
  SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());

  unsigned NextInst = 0, MaxInst = VL.size();
  for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
       VF /= 2) {
    // No actual vectorization should happen if the number of parts is the same
    // as the provided vectorization factor (i.e. the scalar type is used for
    // vector code during codegen).
    auto *VecTy = VectorType::get(VL[0]->getType(), VF);
    if (TTI->getNumberOfParts(VecTy) == VF)
      continue;
    for (unsigned I = NextInst; I < MaxInst; ++I) {
      unsigned OpsWidth = 0;

      if (I + VF > MaxInst)
        OpsWidth = MaxInst - I;
      else
        OpsWidth = VF;

      if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
        break;

      // Check that a previous iteration of this loop did not delete the Value.
      if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
        continue;

      DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
                   << "\n");
      ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);

      ArrayRef<Value *> EmptyArray;
      ArrayRef<Value *> BuildVectorSlice;
      if (!BuildVector.empty())
        BuildVectorSlice = BuildVector.slice(I, OpsWidth);

      R.buildTree(Ops, NeedExtraction ? EmptyArray : BuildVectorSlice);
      // TODO: check if we can allow reordering for more cases.
      if (AllowReorder && R.shouldReorder()) {
        // Conceptually, there is nothing actually preventing us from trying to
        // reorder a larger list. In fact, we do exactly this when vectorizing
        // reductions. However, at this point, we only expect to get here when
        // there are exactly two operations.
        assert(Ops.size() == 2);
        assert(BuildVectorSlice.empty());
        Value *ReorderedOps[] = {Ops[1], Ops[0]};
        R.buildTree(ReorderedOps, None);
      }
      if (R.isTreeTinyAndNotFullyVectorizable())
        continue;

      R.computeMinimumValueSizes();
      int Cost = R.getTreeCost();
      CandidateFound = true;
      MinCost = std::min(MinCost, Cost);

      if (Cost < -SLPCostThreshold) {
        DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
        R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
                                            cast<Instruction>(Ops[0]))
                         << "SLP vectorized with cost " << ore::NV("Cost", Cost)
                         << " and with tree size "
                         << ore::NV("TreeSize", R.getTreeSize()));

        Value *VectorizedRoot = R.vectorizeTree();

        // Reconstruct the build vector by extracting the vectorized root. This
        // way we handle the case where some elements of the vector are
        // undefined.
        //  (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
        if (!BuildVectorSlice.empty()) {
          // The insert point is the last build vector instruction. The
          // vectorized root will precede it. This guarantees that we get an
          // instruction. The vectorized tree could have been constant folded.
          Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
          unsigned VecIdx = 0;
          for (auto &V : BuildVectorSlice) {
            IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
                                        ++BasicBlock::iterator(InsertAfter));
            Instruction *I = cast<Instruction>(V);
            assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
            Instruction *Extract =
                cast<Instruction>(Builder.CreateExtractElement(
                    VectorizedRoot, Builder.getInt32(VecIdx++)));
            I->setOperand(1, Extract);
            I->moveAfter(Extract);
            InsertAfter = I;
          }
        }
        // Move to the next bundle.
        I += VF - 1;
        NextInst = I + 1;
        Changed = true;
      }
    }
  }

  if (!Changed && CandidateFound) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
             << "List vectorization was possible but not beneficial with cost "
             << ore::NV("Cost", MinCost) << " >= "
             << ore::NV("Treshold", -SLPCostThreshold);
    });
  } else if (!Changed) {
    R.getORE()->emit([&]() {
      return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
             << "Cannot SLP vectorize list: vectorization was impossible"
             << " with available vectorization factors";
    });
  }
  return Changed;
}

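/// Try to vectorize a binary operator or compare \p I by pairing its operands,
/// or by pairing one operand with the operands of the other single-use operand.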
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
    return false;

  // Try to vectorize V.
  if (tryToVectorizePair(Op0, Op1, R))
    return true;

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
      return true;
    if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
      return true;
  }

  // Try to skip A.
  if (A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
      return true;
    if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
      return true;
  }
  return false;
}

/// \brief Generate a shuffle mask to be used in a reduction tree.
///
/// \param VecLen The length of the vector to be reduced.
/// \param NumEltsToRdx The number of elements that should be reduced in the
///        vector.
/// \param IsPairwise Whether the reduction is a pairwise or splitting
///        reduction. A pairwise reduction will generate a mask of
///        <0,2,...> or <1,3,..> while a splitting reduction will generate
///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
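/// For example, with VecLen = 4 and NumEltsToRdx = 2 this produces
/// <0,2,undef,undef> (pairwise, left), <1,3,undef,undef> (pairwise, right),
/// or <2,3,undef,undef> (splitting).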
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                   bool IsPairwise, bool IsLeft,
                                   IRBuilder<> &Builder) {
  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");

  SmallVector<Constant *, 32> ShuffleMask(
      VecLen, UndefValue::get(Builder.getInt32Ty()));

  if (IsPairwise)
    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  else
    // Move the upper half of the vector to the lower half.
    for (unsigned i = 0; i != NumEltsToRdx; ++i)
      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);

  return ConstantVector::get(ShuffleMask);
}

namespace {

/// Model horizontal reductions.
///
/// A horizontal reduction is a tree of reduction operations (currently add and
/// fadd) that has operations that can be put into a vector as its leaf.
/// For example, this tree:
///
/// mul mul mul mul
///  \  /    \  /
///   +       +
///    \     /
///       +
/// This tree has "mul" as its reduced values and "+" as its reduction
/// operations. A reduction might be feeding into a store or a binary operation
/// feeding a phi.
///    ...
///    \  /
///     +
///     |
///  phi +=
///
///  Or:
///    ...
///    \  /
///     +
///     |
///   *p =
///
class HorizontalReduction {
  using ReductionOpsType = SmallVector<Value *, 16>;
  using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
  ReductionOpsListType ReductionOps;
  SmallVector<Value *, 32> ReducedVals;
  // Use map vector to make stable output.
  MapVector<Instruction *, Value *> ExtraArgs;

  /// Kind of the reduction data.
  enum ReductionKind {
    RK_None,       /// Not a reduction.
    RK_Arithmetic, /// Binary reduction data.
    RK_Min,        /// Minimum reduction data.
    RK_UMin,       /// Unsigned minimum reduction data.
    RK_Max,        /// Maximum reduction data.
    RK_UMax,       /// Unsigned maximum reduction data.
  };

  /// Contains info about operation, like its opcode, left and right operands.
  class OperationData {
    /// Opcode of the instruction.
    unsigned Opcode = 0;

    /// Left operand of the reduction operation.
    Value *LHS = nullptr;

    /// Right operand of the reduction operation.
    Value *RHS = nullptr;

    /// Kind of the reduction operation.
    ReductionKind Kind = RK_None;

    /// True if a floating-point min/max reduction has no NaNs.
    bool NoNaN = false;

    /// Checks if the reduction operation can be vectorized.
    bool isVectorizable() const {
      return LHS && RHS &&
             // We currently only support adds && min/max reductions.
             ((Kind == RK_Arithmetic &&
               (Opcode == Instruction::Add || Opcode == Instruction::FAdd)) ||
              ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
               (Kind == RK_Min || Kind == RK_Max)) ||
              (Opcode == Instruction::ICmp &&
               (Kind == RK_UMin || Kind == RK_UMax)));
    }

    /// Creates reduction operation with the current opcode.
    Value *createOp(IRBuilder<> &Builder, const Twine &Name) const {
      assert(isVectorizable() &&
             "Expected add|fadd or min/max reduction operation.");
      Value *Cmp;
      switch (Kind) {
      case RK_Arithmetic:
        return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, LHS, RHS,
                                   Name);
      case RK_Min:
        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSLT(LHS, RHS)
                                          : Builder.CreateFCmpOLT(LHS, RHS);
        break;
      case RK_Max:
        Cmp = Opcode == Instruction::ICmp ? Builder.CreateICmpSGT(LHS, RHS)
                                          : Builder.CreateFCmpOGT(LHS, RHS);
        break;
      case RK_UMin:
        assert(Opcode == Instruction::ICmp && "Expected integer types.");
        Cmp = Builder.CreateICmpULT(LHS, RHS);
        break;
      case RK_UMax:
        assert(Opcode == Instruction::ICmp && "Expected integer types.");
        Cmp = Builder.CreateICmpUGT(LHS, RHS);
        break;
      case RK_None:
        llvm_unreachable("Unknown reduction operation.");
      }
      return Builder.CreateSelect(Cmp, LHS, RHS, Name);
    }

  public:
    explicit OperationData() = default;

    /// Construction for reduced values. They are identified by opcode only and
    /// don't have associated LHS/RHS values.
    explicit OperationData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        Opcode = I->getOpcode();
    }

    /// Constructor for reduction operations with opcode and its left and
    /// right operands.
    OperationData(unsigned Opcode, Value *LHS, Value *RHS, ReductionKind Kind,
                  bool NoNaN = false)
        : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind), NoNaN(NoNaN) {
      assert(Kind != RK_None && "One of the reduction operations is expected.");
    }

    explicit operator bool() const { return Opcode; }

    /// Get the index of the first operand.
    unsigned getFirstOperandIndex() const {
      assert(!!*this && "The opcode is not set.");
      switch (Kind) {
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax:
        return 1;
      case RK_Arithmetic:
      case RK_None:
        break;
      }
      return 0;
    }

    /// Total number of operands in the reduction operation.
    unsigned getNumberOfOperands() const {
      assert(Kind != RK_None && !!*this && LHS && RHS &&
             "Expected reduction operation.");
      switch (Kind) {
      case RK_Arithmetic:
        return 2;
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax:
        return 3;
      case RK_None:
        break;
      }
      llvm_unreachable("Reduction kind is not set");
    }

    /// Checks if the operation has the same parent as \p P.
    bool hasSameParent(Instruction *I, Value *P, bool IsRedOp) const {
      assert(Kind != RK_None && !!*this && LHS && RHS &&
             "Expected reduction operation.");
      if (!IsRedOp)
        return I->getParent() == P;
      switch (Kind) {
      case RK_Arithmetic:
        // Arithmetic reduction operation must be used once only.
        return I->getParent() == P;
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax: {
        // SelectInst must be used twice while the condition op must have
        // single use only.
        auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
        return I->getParent() == P && Cmp && Cmp->getParent() == P;
      }
      case RK_None:
        break;
      }
      llvm_unreachable("Reduction kind is not set");
    }

    /// Expected number of uses for reduction operations/reduced values.
    bool hasRequiredNumberOfUses(Instruction *I, bool IsReductionOp) const {
      assert(Kind != RK_None && !!*this && LHS && RHS &&
             "Expected reduction operation.");
      switch (Kind) {
      case RK_Arithmetic:
        return I->hasOneUse();
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax:
        return I->hasNUses(2) &&
               (!IsReductionOp ||
                cast<SelectInst>(I)->getCondition()->hasOneUse());
      case RK_None:
        break;
      }
      llvm_unreachable("Reduction kind is not set");
    }

    /// Initializes the list of reduction operations.
    void initReductionOps(ReductionOpsListType &ReductionOps) {
      assert(Kind != RK_None && !!*this && LHS && RHS &&
             "Expected reduction operation.");
      switch (Kind) {
      case RK_Arithmetic:
        ReductionOps.assign(1, ReductionOpsType());
        break;
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax:
        ReductionOps.assign(2, ReductionOpsType());
        break;
      case RK_None:
        llvm_unreachable("Reduction kind is not set");
      }
    }

    /// Add all reduction operations for the reduction instruction \p I.
    void addReductionOps(Instruction *I, ReductionOpsListType &ReductionOps) {
      assert(Kind != RK_None && !!*this && LHS && RHS &&
             "Expected reduction operation.");
      switch (Kind) {
      case RK_Arithmetic:
        ReductionOps[0].emplace_back(I);
        break;
      case RK_Min:
      case RK_UMin:
      case RK_Max:
      case RK_UMax:
        ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
        ReductionOps[1].emplace_back(I);
        break;
      case RK_None:
        llvm_unreachable("Reduction kind is not set");
      }
    }

    /// Checks if instruction is associative and can be vectorized.
    bool isAssociative(Instruction *I) const {
      assert(Kind != RK_None && *this && LHS && RHS &&
             "Expected reduction operation.");
      switch (Kind) {
      case RK_Arithmetic:
        return I->isAssociative();
      case RK_Min:
      case RK_Max:
        return Opcode == Instruction::ICmp ||
               cast<Instruction>(I->getOperand(0))->isFast();
      case RK_UMin:
      case RK_UMax:
        assert(Opcode == Instruction::ICmp &&
               "Only integer compare operation is expected.");
        return true;
      case RK_None:
        break;
      }
      llvm_unreachable("Reduction kind is not set");
    }

    /// Checks if the reduction operation can be vectorized.
    bool isVectorizable(Instruction *I) const {
      return isVectorizable() && isAssociative(I);
    }

    /// Checks if two operation data are both a reduction op or both a reduced
    /// value.
    bool operator==(const OperationData &OD) {
      assert(((Kind != OD.Kind) || ((!LHS == !OD.LHS) && (!RHS == !OD.RHS))) &&
             "One of the comparing operations is incorrect.");
      return this == &OD || (Kind == OD.Kind && Opcode == OD.Opcode);
    }
    bool operator!=(const OperationData &OD) { return !(*this == OD); }
    void clear() {
      Opcode = 0;
      LHS = nullptr;
      RHS = nullptr;
      Kind = RK_None;
      NoNaN = false;
    }

    /// Get the opcode of the reduction operation.
    unsigned getOpcode() const {
      assert(isVectorizable() && "Expected vectorizable operation.");
      return Opcode;
    }

    /// Get kind of reduction data.
    ReductionKind getKind() const { return Kind; }
    Value *getLHS() const { return LHS; }
    Value *getRHS() const { return RHS; }
    Type *getConditionType() const {
      switch (Kind) {
      case RK_Arithmetic:
        return nullptr;
      case RK_Min:
      case RK_Max:
      case RK_UMin:
      case RK_UMax:
        return CmpInst::makeCmpResultType(LHS->getType());
      case RK_None:
        break;
      }
      llvm_unreachable("Reduction kind is not set");
    }

    /// Creates reduction operation with the current opcode with the IR flags
    /// from \p ReductionOps.
    Value *createOp(IRBuilder<> &Builder, const Twine &Name,
                    const ReductionOpsListType &ReductionOps) const {
      assert(isVectorizable() &&
             "Expected add|fadd or min/max reduction operation.");
      auto *Op = createOp(Builder, Name);
      switch (Kind) {
      case RK_Arithmetic:
        propagateIRFlags(Op, ReductionOps[0]);
        return Op;
      case RK_Min:
      case RK_Max:
      case RK_UMin:
      case RK_UMax:
        if (auto *SI = dyn_cast<SelectInst>(Op))
          propagateIRFlags(SI->getCondition(), ReductionOps[0]);
        propagateIRFlags(Op, ReductionOps[1]);
        return Op;
      case RK_None:
        break;
      }
      llvm_unreachable("Unknown reduction operation.");
    }

    /// Creates reduction operation with the current opcode with the IR flags
    /// from \p I.
    Value *createOp(IRBuilder<> &Builder, const Twine &Name,
                    Instruction *I) const {
      assert(isVectorizable() &&
             "Expected add|fadd or min/max reduction operation.");
      auto *Op = createOp(Builder, Name);
      switch (Kind) {
      case RK_Arithmetic:
        propagateIRFlags(Op, I);
        return Op;
      case RK_Min:
      case RK_Max:
      case RK_UMin:
      case RK_UMax:
        if (auto *SI = dyn_cast<SelectInst>(Op)) {
          propagateIRFlags(SI->getCondition(),
                           cast<SelectInst>(I)->getCondition());
        }
        propagateIRFlags(Op, I);
        return Op;
      case RK_None:
        break;
      }
      llvm_unreachable("Unknown reduction operation.");
    }

    TargetTransformInfo::ReductionFlags getFlags() const {
      TargetTransformInfo::ReductionFlags Flags;
      Flags.NoNaN = NoNaN;
      switch (Kind) {
      case RK_Arithmetic:
        break;
      case RK_Min:
        Flags.IsSigned = Opcode == Instruction::ICmp;
        Flags.IsMaxOp = false;
        break;
      case RK_Max:
        Flags.IsSigned = Opcode == Instruction::ICmp;
        Flags.IsMaxOp = true;
        break;
      case RK_UMin:
        Flags.IsSigned = false;
        Flags.IsMaxOp = false;
        break;
      case RK_UMax:
        Flags.IsSigned = false;
        Flags.IsMaxOp = true;
        break;
      case RK_None:
        llvm_unreachable("Reduction kind is not set");
      }
      return Flags;
    }
  };

  Instruction *ReductionRoot = nullptr;

  /// The operation data of the reduction operation.
  OperationData ReductionData;

  /// The operation data of the values we perform a reduction on.
  OperationData ReducedValueData;

  /// Should we model this reduction as a pairwise reduction tree or a tree that
  /// splits the vector in halves and adds those halves.
  bool IsPairwiseReduction = false;

  /// Checks if the ParentStackElem.first should be marked as a reduction
  /// operation with an extra argument or as extra argument itself.
  void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
                    Value *ExtraArg) {
    if (ExtraArgs.count(ParentStackElem.first)) {
      ExtraArgs[ParentStackElem.first] = nullptr;
      // We ran into something like:
      // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
      // The whole ParentStackElem.first should be considered as an extra value
      // in this case.
      // Do not perform analysis of remaining operands of ParentStackElem.first
      // instruction, this whole instruction is an extra argument.
      ParentStackElem.second = ParentStackElem.first->getNumOperands();
    } else {
      // We ran into something like:
      // ParentStackElem.first += ... + ExtraArg + ...
      ExtraArgs[ParentStackElem.first] = ExtraArg;
    }
  }

  static OperationData getOperationData(Value *V) {
    if (!V)
      return OperationData();

    Value *LHS;
    Value *RHS;
    if (m_BinOp(m_Value(LHS), m_Value(RHS)).match(V)) {
      return OperationData(cast<BinaryOperator>(V)->getOpcode(), LHS, RHS,
                           RK_Arithmetic);
    }
    if (auto *Select = dyn_cast<SelectInst>(V)) {
      // Look for a min/max pattern.
      if (m_UMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMin);
      } else if (m_SMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_Min);
      } else if (m_OrdFMin(m_Value(LHS), m_Value(RHS)).match(Select) ||
                 m_UnordFMin(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(
            Instruction::FCmp, LHS, RHS, RK_Min,
            cast<Instruction>(Select->getCondition())->hasNoNaNs());
      } else if (m_UMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_UMax);
      } else if (m_SMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(Instruction::ICmp, LHS, RHS, RK_Max);
      } else if (m_OrdFMax(m_Value(LHS), m_Value(RHS)).match(Select) ||
                 m_UnordFMax(m_Value(LHS), m_Value(RHS)).match(Select)) {
        return OperationData(
            Instruction::FCmp, LHS, RHS, RK_Max,
            cast<Instruction>(Select->getCondition())->hasNoNaNs());
      }
    }
    return OperationData(V);
  }

  4623. public:
  4624. HorizontalReduction() = default;
  4625. /// \brief Try to find a reduction tree.
  4626. bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
  4627. assert((!Phi || is_contained(Phi->operands(), B)) &&
  4628. "Thi phi needs to use the binary operator");
  4629. ReductionData = getOperationData(B);
  4630. // We could have a initial reductions that is not an add.
  4631. // r *= v1 + v2 + v3 + v4
  4632. // In such a case start looking for a tree rooted in the first '+'.
  4633. if (Phi) {
  4634. if (ReductionData.getLHS() == Phi) {
  4635. Phi = nullptr;
  4636. B = dyn_cast<Instruction>(ReductionData.getRHS());
  4637. ReductionData = getOperationData(B);
  4638. } else if (ReductionData.getRHS() == Phi) {
  4639. Phi = nullptr;
  4640. B = dyn_cast<Instruction>(ReductionData.getLHS());
  4641. ReductionData = getOperationData(B);
  4642. }
  4643. }
  4644. if (!ReductionData.isVectorizable(B))
  4645. return false;
  4646. Type *Ty = B->getType();
  4647. if (!isValidElementType(Ty))
  4648. return false;
  4649. ReducedValueData.clear();
  4650. ReductionRoot = B;
  4651. // Post order traverse the reduction tree starting at B. We only handle true
  4652. // trees containing only binary operators.
  4653. SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
  4654. Stack.push_back(std::make_pair(B, ReductionData.getFirstOperandIndex()));
  4655. ReductionData.initReductionOps(ReductionOps);
  4656. while (!Stack.empty()) {
  4657. Instruction *TreeN = Stack.back().first;
  4658. unsigned EdgeToVist = Stack.back().second++;
  4659. OperationData OpData = getOperationData(TreeN);
  4660. bool IsReducedValue = OpData != ReductionData;
4661. // Postorder visit.
4662. if (IsReducedValue || EdgeToVisit == OpData.getNumberOfOperands()) {
  4663. if (IsReducedValue)
  4664. ReducedVals.push_back(TreeN);
  4665. else {
  4666. auto I = ExtraArgs.find(TreeN);
  4667. if (I != ExtraArgs.end() && !I->second) {
  4668. // Check if TreeN is an extra argument of its parent operation.
  4669. if (Stack.size() <= 1) {
  4670. // TreeN can't be an extra argument as it is a root reduction
  4671. // operation.
  4672. return false;
  4673. }
  4674. // Yes, TreeN is an extra argument, do not add it to a list of
  4675. // reduction operations.
  4676. // Stack[Stack.size() - 2] always points to the parent operation.
  4677. markExtraArg(Stack[Stack.size() - 2], TreeN);
  4678. ExtraArgs.erase(TreeN);
  4679. } else
  4680. ReductionData.addReductionOps(TreeN, ReductionOps);
  4681. }
  4682. // Retract.
  4683. Stack.pop_back();
  4684. continue;
  4685. }
  4686. // Visit left or right.
  4687. Value *NextV = TreeN->getOperand(EdgeToVist);
  4688. if (NextV != Phi) {
  4689. auto *I = dyn_cast<Instruction>(NextV);
  4690. OpData = getOperationData(I);
  4691. // Continue analysis if the next operand is a reduction operation or
  4692. // (possibly) a reduced value. If the reduced value opcode is not set,
4693. // the first operation encountered that differs from the reduction operation
4694. // is taken as the reduced value class.
  4695. if (I && (!ReducedValueData || OpData == ReducedValueData ||
  4696. OpData == ReductionData)) {
  4697. const bool IsReductionOperation = OpData == ReductionData;
  4698. // Only handle trees in the current basic block.
  4699. if (!ReductionData.hasSameParent(I, B->getParent(),
  4700. IsReductionOperation)) {
  4701. // I is an extra argument for TreeN (its parent operation).
  4702. markExtraArg(Stack.back(), I);
  4703. continue;
  4704. }
4705. // Each tree node needs to have a minimal number of users, except for the
  4706. // ultimate reduction.
  4707. if (!ReductionData.hasRequiredNumberOfUses(I,
  4708. OpData == ReductionData) &&
  4709. I != B) {
  4710. // I is an extra argument for TreeN (its parent operation).
  4711. markExtraArg(Stack.back(), I);
  4712. continue;
  4713. }
  4714. if (IsReductionOperation) {
  4715. // We need to be able to reassociate the reduction operations.
  4716. if (!OpData.isAssociative(I)) {
  4717. // I is an extra argument for TreeN (its parent operation).
  4718. markExtraArg(Stack.back(), I);
  4719. continue;
  4720. }
  4721. } else if (ReducedValueData &&
  4722. ReducedValueData != OpData) {
  4723. // Make sure that the opcodes of the operations that we are going to
  4724. // reduce match.
  4725. // I is an extra argument for TreeN (its parent operation).
  4726. markExtraArg(Stack.back(), I);
  4727. continue;
  4728. } else if (!ReducedValueData)
  4729. ReducedValueData = OpData;
  4730. Stack.push_back(std::make_pair(I, OpData.getFirstOperandIndex()));
  4731. continue;
  4732. }
  4733. }
  4734. // NextV is an extra argument for TreeN (its parent operation).
  4735. markExtraArg(Stack.back(), NextV);
  4736. }
  4737. return true;
  4738. }
  4739. /// \brief Attempt to vectorize the tree found by
  4740. /// matchAssociativeReduction.
  4741. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
  4742. if (ReducedVals.empty())
  4743. return false;
  4744. // If there is a sufficient number of reduction values, reduce
4745. // to a nearby power-of-2. We can safely generate oversized
  4746. // vectors and rely on the backend to split them to legal sizes.
  4747. unsigned NumReducedVals = ReducedVals.size();
  4748. if (NumReducedVals < 4)
  4749. return false;
  4750. unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
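// Illustrative walk-through with made-up sizes: for NumReducedVals == 7 the
// loop below first vectorizes a chunk of ReduxWidth == 4 values; the next
// width, PowerOf2Floor(7 - 4) == 2, fails the "ReduxWidth > 2" check, so the
// remaining three values are folded into the result one at a time by the
// scalar epilogue further down.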
  4751. Value *VectorizedTree = nullptr;
  4752. IRBuilder<> Builder(ReductionRoot);
  4753. FastMathFlags Unsafe;
  4754. Unsafe.setFast();
  4755. Builder.setFastMathFlags(Unsafe);
  4756. unsigned i = 0;
  4757. BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
4758. // The same extra argument may be used several times, so log each attempt
  4759. // to use it.
  4760. for (auto &Pair : ExtraArgs)
  4761. ExternallyUsedValues[Pair.second].push_back(Pair.first);
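// Illustrative (hypothetical) example: if a function argument %arg appears
// directly in the reduction chain, matchAssociativeReduction recorded it in
// ExtraArgs; here it becomes an externally used value and is re-added to the
// final scalar result by the "op.extra" operations emitted below.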
  4762. SmallVector<Value *, 16> IgnoreList;
  4763. for (auto &V : ReductionOps)
  4764. IgnoreList.append(V.begin(), V.end());
  4765. while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
  4766. auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
  4767. V.buildTree(VL, ExternallyUsedValues, IgnoreList);
  4768. if (V.shouldReorder()) {
  4769. SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
  4770. V.buildTree(Reversed, ExternallyUsedValues, IgnoreList);
  4771. }
  4772. if (V.isTreeTinyAndNotFullyVectorizable())
  4773. break;
  4774. V.computeMinimumValueSizes();
  4775. // Estimate cost.
  4776. int Cost =
  4777. V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
  4778. if (Cost >= -SLPCostThreshold) {
  4779. V.getORE()->emit([&]() {
  4780. return OptimizationRemarkMissed(
  4781. SV_NAME, "HorSLPNotBeneficial", cast<Instruction>(VL[0]))
  4782. << "Vectorizing horizontal reduction is possible"
  4783. << "but not beneficial with cost "
  4784. << ore::NV("Cost", Cost) << " and threshold "
  4785. << ore::NV("Threshold", -SLPCostThreshold);
  4786. });
  4787. break;
  4788. }
  4789. DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
  4790. << ". (HorRdx)\n");
  4791. V.getORE()->emit([&]() {
  4792. return OptimizationRemark(
  4793. SV_NAME, "VectorizedHorizontalReduction", cast<Instruction>(VL[0]))
  4794. << "Vectorized horizontal reduction with cost "
  4795. << ore::NV("Cost", Cost) << " and with tree size "
  4796. << ore::NV("TreeSize", V.getTreeSize());
  4797. });
  4798. // Vectorize a tree.
  4799. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
  4800. Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
  4801. // Emit a reduction.
  4802. Value *ReducedSubTree =
  4803. emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
  4804. if (VectorizedTree) {
  4805. Builder.SetCurrentDebugLocation(Loc);
  4806. OperationData VectReductionData(ReductionData.getOpcode(),
  4807. VectorizedTree, ReducedSubTree,
  4808. ReductionData.getKind());
  4809. VectorizedTree =
  4810. VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
  4811. } else
  4812. VectorizedTree = ReducedSubTree;
  4813. i += ReduxWidth;
  4814. ReduxWidth = PowerOf2Floor(NumReducedVals - i);
  4815. }
  4816. if (VectorizedTree) {
  4817. // Finish the reduction.
  4818. for (; i < NumReducedVals; ++i) {
  4819. auto *I = cast<Instruction>(ReducedVals[i]);
  4820. Builder.SetCurrentDebugLocation(I->getDebugLoc());
  4821. OperationData VectReductionData(ReductionData.getOpcode(),
  4822. VectorizedTree, I,
  4823. ReductionData.getKind());
  4824. VectorizedTree = VectReductionData.createOp(Builder, "", ReductionOps);
  4825. }
  4826. for (auto &Pair : ExternallyUsedValues) {
  4827. assert(!Pair.second.empty() &&
  4828. "At least one DebugLoc must be inserted");
  4829. // Add each externally used value to the final reduction.
  4830. for (auto *I : Pair.second) {
  4831. Builder.SetCurrentDebugLocation(I->getDebugLoc());
  4832. OperationData VectReductionData(ReductionData.getOpcode(),
  4833. VectorizedTree, Pair.first,
  4834. ReductionData.getKind());
  4835. VectorizedTree = VectReductionData.createOp(Builder, "op.extra", I);
  4836. }
  4837. }
  4838. // Update users.
  4839. ReductionRoot->replaceAllUsesWith(VectorizedTree);
  4840. }
  4841. return VectorizedTree != nullptr;
  4842. }
  4843. unsigned numReductionValues() const {
  4844. return ReducedVals.size();
  4845. }
  4846. private:
  4847. /// \brief Calculate the cost of a reduction.
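/// Illustrative numbers (made up for exposition): for an 8-wide add reduction
/// where the splitting vector form costs 3 and a scalar add costs 1, the
/// returned value is 3 - (8 - 1) * 1 = -4, i.e. the vector form is expected
/// to be profitable.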
  4848. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
  4849. unsigned ReduxWidth) {
  4850. Type *ScalarTy = FirstReducedVal->getType();
  4851. Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
  4852. int PairwiseRdxCost;
  4853. int SplittingRdxCost;
  4854. switch (ReductionData.getKind()) {
  4855. case RK_Arithmetic:
  4856. PairwiseRdxCost =
  4857. TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
  4858. /*IsPairwiseForm=*/true);
  4859. SplittingRdxCost =
  4860. TTI->getArithmeticReductionCost(ReductionData.getOpcode(), VecTy,
  4861. /*IsPairwiseForm=*/false);
  4862. break;
  4863. case RK_Min:
  4864. case RK_Max:
  4865. case RK_UMin:
  4866. case RK_UMax: {
  4867. Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
  4868. bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
  4869. ReductionData.getKind() == RK_UMax;
  4870. PairwiseRdxCost =
  4871. TTI->getMinMaxReductionCost(VecTy, VecCondTy,
  4872. /*IsPairwiseForm=*/true, IsUnsigned);
  4873. SplittingRdxCost =
  4874. TTI->getMinMaxReductionCost(VecTy, VecCondTy,
  4875. /*IsPairwiseForm=*/false, IsUnsigned);
  4876. break;
  4877. }
  4878. case RK_None:
  4879. llvm_unreachable("Expected arithmetic or min/max reduction operation");
  4880. }
  4881. IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
  4882. int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
  4883. int ScalarReduxCost;
  4884. switch (ReductionData.getKind()) {
  4885. case RK_Arithmetic:
  4886. ScalarReduxCost =
  4887. TTI->getArithmeticInstrCost(ReductionData.getOpcode(), ScalarTy);
  4888. break;
  4889. case RK_Min:
  4890. case RK_Max:
  4891. case RK_UMin:
  4892. case RK_UMax:
  4893. ScalarReduxCost =
  4894. TTI->getCmpSelInstrCost(ReductionData.getOpcode(), ScalarTy) +
  4895. TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
  4896. CmpInst::makeCmpResultType(ScalarTy));
  4897. break;
  4898. case RK_None:
  4899. llvm_unreachable("Expected arithmetic or min/max reduction operation");
  4900. }
  4901. ScalarReduxCost *= (ReduxWidth - 1);
  4902. DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
  4903. << " for reduction that starts with " << *FirstReducedVal
  4904. << " (It is a "
  4905. << (IsPairwiseReduction ? "pairwise" : "splitting")
  4906. << " reduction)\n");
  4907. return VecReduxCost - ScalarReduxCost;
  4908. }
  4909. /// \brief Emit a horizontal reduction of the vectorized value.
  4910. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
  4911. unsigned ReduxWidth, const TargetTransformInfo *TTI) {
  4912. assert(VectorizedValue && "Need to have a vectorized tree node");
  4913. assert(isPowerOf2_32(ReduxWidth) &&
  4914. "We only handle power-of-two reductions for now");
  4915. if (!IsPairwiseReduction)
  4916. return createSimpleTargetReduction(
  4917. Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
  4918. ReductionData.getFlags(), ReductionOps.back());
  4919. Value *TmpVec = VectorizedValue;
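// Illustrative sketch, assuming the usual pairwise masks of even/odd lanes:
// for a <4 x float> input, the first iteration combines shuffles
// <0,2,undef,undef> and <1,3,undef,undef>, the second combines
// <0,undef,undef,undef> and <1,undef,undef,undef>, and the result ends up in
// lane 0, which is extracted below.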
  4920. for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
  4921. Value *LeftMask =
  4922. createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
  4923. Value *RightMask =
  4924. createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
  4925. Value *LeftShuf = Builder.CreateShuffleVector(
  4926. TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
  4927. Value *RightShuf = Builder.CreateShuffleVector(
4928. TmpVec, UndefValue::get(TmpVec->getType()), RightMask,
  4929. "rdx.shuf.r");
  4930. OperationData VectReductionData(ReductionData.getOpcode(), LeftShuf,
  4931. RightShuf, ReductionData.getKind());
  4932. TmpVec = VectReductionData.createOp(Builder, "op.rdx", ReductionOps);
  4933. }
  4934. // The result is in the first element of the vector.
  4935. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  4936. }
  4937. };
  4938. } // end anonymous namespace
  4939. /// \brief Recognize construction of vectors like
  4940. /// %ra = insertelement <4 x float> undef, float %s0, i32 0
  4941. /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
  4942. /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
  4943. /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
  4944. /// starting from the last insertelement instruction.
  4945. ///
  4946. /// Returns true if it matches
  4947. static bool findBuildVector(InsertElementInst *LastInsertElem,
  4948. SmallVectorImpl<Value *> &BuildVector,
  4949. SmallVectorImpl<Value *> &BuildVectorOpds) {
  4950. Value *V = nullptr;
  4951. do {
  4952. BuildVector.push_back(LastInsertElem);
  4953. BuildVectorOpds.push_back(LastInsertElem->getOperand(1));
  4954. V = LastInsertElem->getOperand(0);
  4955. if (isa<UndefValue>(V))
  4956. break;
  4957. LastInsertElem = dyn_cast<InsertElementInst>(V);
  4958. if (!LastInsertElem || !LastInsertElem->hasOneUse())
  4959. return false;
  4960. } while (true);
  4961. std::reverse(BuildVector.begin(), BuildVector.end());
  4962. std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  4963. return true;
  4964. }
  4965. /// \brief Like findBuildVector, but looks for construction of aggregate.
  4966. ///
  4967. /// \return true if it matches.
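/// Illustrative (hypothetical) example:
///   %ins0 = insertvalue { float, float } undef, float %s0, 0
///   %ins1 = insertvalue { float, float } %ins0, float %s1, 1
/// matched starting from %ins1, producing the operands %s0 and %s1 in order.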
  4968. static bool findBuildAggregate(InsertValueInst *IV,
  4969. SmallVectorImpl<Value *> &BuildVector,
  4970. SmallVectorImpl<Value *> &BuildVectorOpds) {
  4971. Value *V;
  4972. do {
  4973. BuildVector.push_back(IV);
  4974. BuildVectorOpds.push_back(IV->getInsertedValueOperand());
  4975. V = IV->getAggregateOperand();
  4976. if (isa<UndefValue>(V))
  4977. break;
  4978. IV = dyn_cast<InsertValueInst>(V);
  4979. if (!IV || !IV->hasOneUse())
  4980. return false;
  4981. } while (true);
  4982. std::reverse(BuildVector.begin(), BuildVector.end());
  4983. std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  4984. return true;
  4985. }
  4986. static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  4987. return V->getType() < V2->getType();
  4988. }
  4989. /// \brief Try and get a reduction value from a phi node.
  4990. ///
  4991. /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
  4992. /// if they come from either \p ParentBB or a containing loop latch.
  4993. ///
  4994. /// \returns A candidate reduction value if possible, or \code nullptr \endcode
  4995. /// if not possible.
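/// Illustrative (hypothetical) example:
///   loop:
///     %sum = phi float [ 0.000000e+00, %entry ], [ %sum.next, %loop ]
///     %sum.next = fadd fast float %sum, %x
/// With \p P == %sum and \p ParentBB == %loop, the incoming value %sum.next
/// comes from \p ParentBB itself and its defining block is dominated by the
/// phi's block, so it is returned as the candidate reduction value.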
  4996. static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
  4997. BasicBlock *ParentBB, LoopInfo *LI) {
  4998. // There are situations where the reduction value is not dominated by the
  4999. // reduction phi. Vectorizing such cases has been reported to cause
  5000. // miscompiles. See PR25787.
  5001. auto DominatedReduxValue = [&](Value *R) {
  5002. return (
  5003. dyn_cast<Instruction>(R) &&
  5004. DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  5005. };
  5006. Value *Rdx = nullptr;
  5007. // Return the incoming value if it comes from the same BB as the phi node.
  5008. if (P->getIncomingBlock(0) == ParentBB) {
  5009. Rdx = P->getIncomingValue(0);
  5010. } else if (P->getIncomingBlock(1) == ParentBB) {
  5011. Rdx = P->getIncomingValue(1);
  5012. }
  5013. if (Rdx && DominatedReduxValue(Rdx))
  5014. return Rdx;
  5015. // Otherwise, check whether we have a loop latch to look at.
  5016. Loop *BBL = LI->getLoopFor(ParentBB);
  5017. if (!BBL)
  5018. return nullptr;
  5019. BasicBlock *BBLatch = BBL->getLoopLatch();
  5020. if (!BBLatch)
  5021. return nullptr;
  5022. // There is a loop latch, return the incoming value if it comes from
  5023. // that. This reduction pattern occasionally turns up.
  5024. if (P->getIncomingBlock(0) == BBLatch) {
  5025. Rdx = P->getIncomingValue(0);
  5026. } else if (P->getIncomingBlock(1) == BBLatch) {
  5027. Rdx = P->getIncomingValue(1);
  5028. }
  5029. if (Rdx && DominatedReduxValue(Rdx))
  5030. return Rdx;
  5031. return nullptr;
  5032. }
  5033. /// Attempt to reduce a horizontal reduction.
  5034. /// If it is legal to match a horizontal reduction feeding the phi node \a P
  5035. /// with reduction operators \a Root (or one of its operands) in a basic block
  5036. /// \a BB, then check if it can be done. If horizontal reduction is not found
5037. /// and the root instruction is a binary operation, vectorization of the operands is
  5038. /// attempted.
  5039. /// \returns true if a horizontal reduction was matched and reduced or operands
5040. /// of one of the binary instructions were vectorized.
  5041. /// \returns false if a horizontal reduction was not matched (or not possible)
  5042. /// or no vectorization of any binary operation feeding \a Root instruction was
  5043. /// performed.
  5044. static bool tryToVectorizeHorReductionOrInstOperands(
  5045. PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
  5046. TargetTransformInfo *TTI,
  5047. const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
  5048. if (!ShouldVectorizeHor)
  5049. return false;
  5050. if (!Root)
  5051. return false;
  5052. if (Root->getParent() != BB || isa<PHINode>(Root))
  5053. return false;
5054. // Start the analysis from the Root instruction. If a horizontal reduction is
5055. // found, try to vectorize it. If it is not a horizontal reduction, or
5056. // vectorization is not possible or not effective, and the currently analyzed
5057. // instruction is a binary operation, try to vectorize the operands using
5058. // pre-order DFS traversal order. If the operands were not vectorized, repeat
5059. // the same procedure, considering each operand as a possible root of a
5060. // horizontal reduction.
5061. // Interrupt the process if the Root instruction itself was vectorized or all
5062. // sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
  5063. SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
  5064. SmallSet<Value *, 8> VisitedInstrs;
  5065. bool Res = false;
  5066. while (!Stack.empty()) {
  5067. Value *V;
  5068. unsigned Level;
  5069. std::tie(V, Level) = Stack.pop_back_val();
  5070. if (!V)
  5071. continue;
  5072. auto *Inst = dyn_cast<Instruction>(V);
  5073. if (!Inst)
  5074. continue;
  5075. auto *BI = dyn_cast<BinaryOperator>(Inst);
  5076. auto *SI = dyn_cast<SelectInst>(Inst);
  5077. if (BI || SI) {
  5078. HorizontalReduction HorRdx;
  5079. if (HorRdx.matchAssociativeReduction(P, Inst)) {
  5080. if (HorRdx.tryToReduce(R, TTI)) {
  5081. Res = true;
5082. // Set P to nullptr to avoid re-analysis of the phi node in
  5083. // matchAssociativeReduction function unless this is the root node.
  5084. P = nullptr;
  5085. continue;
  5086. }
  5087. }
  5088. if (P && BI) {
  5089. Inst = dyn_cast<Instruction>(BI->getOperand(0));
  5090. if (Inst == P)
  5091. Inst = dyn_cast<Instruction>(BI->getOperand(1));
  5092. if (!Inst) {
5093. // Set P to nullptr to avoid re-analysis of the phi node in
  5094. // matchAssociativeReduction function unless this is the root node.
  5095. P = nullptr;
  5096. continue;
  5097. }
  5098. }
  5099. }
5100. // Set P to nullptr to avoid re-analysis of the phi node in
  5101. // matchAssociativeReduction function unless this is the root node.
  5102. P = nullptr;
  5103. if (Vectorize(Inst, R)) {
  5104. Res = true;
  5105. continue;
  5106. }
  5107. // Try to vectorize operands.
5108. // Continue the analysis only for instructions from the same basic block,
5109. // to save compile time.
  5110. if (++Level < RecursionMaxDepth)
  5111. for (auto *Op : Inst->operand_values())
  5112. if (VisitedInstrs.insert(Op).second)
  5113. if (auto *I = dyn_cast<Instruction>(Op))
  5114. if (!isa<PHINode>(I) && I->getParent() == BB)
  5115. Stack.emplace_back(Op, Level);
  5116. }
  5117. return Res;
  5118. }
  5119. bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
  5120. BasicBlock *BB, BoUpSLP &R,
  5121. TargetTransformInfo *TTI) {
  5122. if (!V)
  5123. return false;
  5124. auto *I = dyn_cast<Instruction>(V);
  5125. if (!I)
  5126. return false;
  5127. if (!isa<BinaryOperator>(I))
  5128. P = nullptr;
  5129. // Try to match and vectorize a horizontal reduction.
  5130. auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
  5131. return tryToVectorize(I, R);
  5132. };
  5133. return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
  5134. ExtraVectorization);
  5135. }
  5136. bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
  5137. BasicBlock *BB, BoUpSLP &R) {
  5138. const DataLayout &DL = BB->getModule()->getDataLayout();
  5139. if (!R.canMapToVector(IVI->getType(), DL))
  5140. return false;
  5141. SmallVector<Value *, 16> BuildVector;
  5142. SmallVector<Value *, 16> BuildVectorOpds;
  5143. if (!findBuildAggregate(IVI, BuildVector, BuildVectorOpds))
  5144. return false;
  5145. DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
5146. // An aggregate value is unlikely to be processed in a vector register; we need
5147. // to extract the scalars into scalar registers, so NeedExtraction is set to true.
  5148. return tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, true);
  5149. }
  5150. bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
  5151. BasicBlock *BB, BoUpSLP &R) {
  5152. SmallVector<Value *, 16> BuildVector;
  5153. SmallVector<Value *, 16> BuildVectorOpds;
  5154. if (!findBuildVector(IEI, BuildVector, BuildVectorOpds))
  5155. return false;
  5156. // Vectorize starting with the build vector operands ignoring the BuildVector
  5157. // instructions for the purpose of scheduling and user extraction.
  5158. return tryToVectorizeList(BuildVectorOpds, R, BuildVector);
  5159. }
  5160. bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
  5161. BoUpSLP &R) {
  5162. if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
  5163. return true;
  5164. bool OpsChanged = false;
  5165. for (int Idx = 0; Idx < 2; ++Idx) {
  5166. OpsChanged |=
  5167. vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
  5168. }
  5169. return OpsChanged;
  5170. }
  5171. bool SLPVectorizerPass::vectorizeSimpleInstructions(
  5172. SmallVectorImpl<WeakVH> &Instructions, BasicBlock *BB, BoUpSLP &R) {
  5173. bool OpsChanged = false;
  5174. for (auto &VH : reverse(Instructions)) {
  5175. auto *I = dyn_cast_or_null<Instruction>(VH);
  5176. if (!I)
  5177. continue;
  5178. if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
  5179. OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
  5180. else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
  5181. OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
  5182. else if (auto *CI = dyn_cast<CmpInst>(I))
  5183. OpsChanged |= vectorizeCmpInst(CI, BB, R);
  5184. }
  5185. Instructions.clear();
  5186. return OpsChanged;
  5187. }
  5188. bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  5189. bool Changed = false;
  5190. SmallVector<Value *, 4> Incoming;
  5191. SmallSet<Value *, 16> VisitedInstrs;
  5192. bool HaveVectorizedPhiNodes = true;
  5193. while (HaveVectorizedPhiNodes) {
  5194. HaveVectorizedPhiNodes = false;
  5195. // Collect the incoming values from the PHIs.
  5196. Incoming.clear();
  5197. for (Instruction &I : *BB) {
  5198. PHINode *P = dyn_cast<PHINode>(&I);
  5199. if (!P)
  5200. break;
  5201. if (!VisitedInstrs.count(P))
  5202. Incoming.push_back(P);
  5203. }
  5204. // Sort by type.
  5205. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
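// Illustrative (hypothetical) example: incoming phi nodes of types
// {i32, float, i32, float} are grouped into {i32, i32, float, float} so that
// each same-type run can be tried as a single bundle by the loop below.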
5206. // Try to vectorize elements based on their type.
  5207. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
  5208. E = Incoming.end();
  5209. IncIt != E;) {
  5210. // Look for the next elements with the same type.
  5211. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
  5212. while (SameTypeIt != E &&
  5213. (*SameTypeIt)->getType() == (*IncIt)->getType()) {
  5214. VisitedInstrs.insert(*SameTypeIt);
  5215. ++SameTypeIt;
  5216. }
  5217. // Try to vectorize them.
  5218. unsigned NumElts = (SameTypeIt - IncIt);
  5219. DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
  5220. // The order in which the phi nodes appear in the program does not matter.
  5221. // So allow tryToVectorizeList to reorder them if it is beneficial. This
  5222. // is done when there are exactly two elements since tryToVectorizeList
  5223. // asserts that there are only two values when AllowReorder is true.
  5224. bool AllowReorder = NumElts == 2;
  5225. if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
  5226. None, AllowReorder)) {
5227. // Success: start over because instructions might have been changed.
  5228. HaveVectorizedPhiNodes = true;
  5229. Changed = true;
  5230. break;
  5231. }
  5232. // Start over at the next instruction of a different type (or the end).
  5233. IncIt = SameTypeIt;
  5234. }
  5235. }
  5236. VisitedInstrs.clear();
  5237. SmallVector<WeakVH, 8> PostProcessInstructions;
  5238. SmallDenseSet<Instruction *, 4> KeyNodes;
  5239. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
5240. // We may go through BB multiple times so skip the instructions we have already checked.
  5241. if (!VisitedInstrs.insert(&*it).second) {
  5242. if (it->use_empty() && KeyNodes.count(&*it) > 0 &&
  5243. vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
  5244. // We would like to start over since some instructions are deleted
5245. // and the iterator may become invalid.
  5246. Changed = true;
  5247. it = BB->begin();
  5248. e = BB->end();
  5249. }
  5250. continue;
  5251. }
  5252. if (isa<DbgInfoIntrinsic>(it))
  5253. continue;
  5254. // Try to vectorize reductions that use PHINodes.
  5255. if (PHINode *P = dyn_cast<PHINode>(it)) {
  5256. // Check that the PHI is a reduction PHI.
  5257. if (P->getNumIncomingValues() != 2)
  5258. return Changed;
  5259. // Try to match and vectorize a horizontal reduction.
  5260. if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
  5261. TTI)) {
  5262. Changed = true;
  5263. it = BB->begin();
  5264. e = BB->end();
  5265. continue;
  5266. }
  5267. continue;
  5268. }
5269. // We ran into an instruction without users, such as a terminator, a store, or
5270. // a function call with an ignored return value. Ignore unused instructions
5271. // (based on the instruction type, except for CallInst and InvokeInst).
  5272. if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
  5273. isa<InvokeInst>(it))) {
  5274. KeyNodes.insert(&*it);
  5275. bool OpsChanged = false;
  5276. if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
  5277. for (auto *V : it->operand_values()) {
  5278. // Try to match and vectorize a horizontal reduction.
  5279. OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
  5280. }
  5281. }
  5282. // Start vectorization of post-process list of instructions from the
  5283. // top-tree instructions to try to vectorize as many instructions as
  5284. // possible.
  5285. OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
  5286. if (OpsChanged) {
  5287. // We would like to start over since some instructions are deleted
5288. // and the iterator may become invalid.
  5289. Changed = true;
  5290. it = BB->begin();
  5291. e = BB->end();
  5292. continue;
  5293. }
  5294. }
  5295. if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
  5296. isa<InsertValueInst>(it))
  5297. PostProcessInstructions.push_back(&*it);
  5298. }
  5299. return Changed;
  5300. }
  5301. bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  5302. auto Changed = false;
  5303. for (auto &Entry : GEPs) {
  5304. // If the getelementptr list has fewer than two elements, there's nothing
  5305. // to do.
  5306. if (Entry.second.size() < 2)
  5307. continue;
  5308. DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
  5309. << Entry.second.size() << ".\n");
  5310. // We process the getelementptr list in chunks of 16 (like we do for
  5311. // stores) to minimize compile-time.
  5312. for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
  5313. auto Len = std::min<unsigned>(BE - BI, 16);
  5314. auto GEPList = makeArrayRef(&Entry.second[BI], Len);
5315. // Initialize a set of candidate getelementptrs. Note that we use a
  5316. // SetVector here to preserve program order. If the index computations
  5317. // are vectorizable and begin with loads, we want to minimize the chance
  5318. // of having to reorder them later.
  5319. SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
5320. // Some of the candidates may have already been vectorized after we
5321. // initially collected them. If so, the WeakTrackingVHs will have
5322. // nullified the values, so remove them from the set of
5323. // candidates.
  5324. Candidates.remove(nullptr);
  5325. // Remove from the set of candidates all pairs of getelementptrs with
  5326. // constant differences. Such getelementptrs are likely not good
  5327. // candidates for vectorization in a bottom-up phase since one can be
  5328. // computed from the other. We also ensure all candidate getelementptr
  5329. // indices are unique.
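// Illustrative (hypothetical) example:
//   %g1 = getelementptr inbounds i32, i32* %p, i64 %i
//   %g2 = getelementptr inbounds i32, i32* %p, i64 %j
// If SCEV can prove %j == %i + 1, the difference of the two pointer SCEVs is
// the constant -4 (bytes), so both getelementptrs are removed from the
// candidate set.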
  5330. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
  5331. auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
  5332. if (!Candidates.count(GEPI))
  5333. continue;
  5334. auto *SCEVI = SE->getSCEV(GEPList[I]);
  5335. for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
  5336. auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
  5337. auto *SCEVJ = SE->getSCEV(GEPList[J]);
  5338. if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
  5339. Candidates.remove(GEPList[I]);
  5340. Candidates.remove(GEPList[J]);
  5341. } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
  5342. Candidates.remove(GEPList[J]);
  5343. }
  5344. }
  5345. }
  5346. // We break out of the above computation as soon as we know there are
  5347. // fewer than two candidates remaining.
  5348. if (Candidates.size() < 2)
  5349. continue;
  5350. // Add the single, non-constant index of each candidate to the bundle. We
  5351. // ensured the indices met these constraints when we originally collected
  5352. // the getelementptrs.
  5353. SmallVector<Value *, 16> Bundle(Candidates.size());
  5354. auto BundleIndex = 0u;
  5355. for (auto *V : Candidates) {
  5356. auto *GEP = cast<GetElementPtrInst>(V);
  5357. auto *GEPIdx = GEP->idx_begin()->get();
  5358. assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
  5359. Bundle[BundleIndex++] = GEPIdx;
  5360. }
  5361. // Try and vectorize the indices. We are currently only interested in
  5362. // gather-like cases of the form:
  5363. //
  5364. // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
  5365. //
  5366. // where the loads of "a", the loads of "b", and the subtractions can be
  5367. // performed in parallel. It's likely that detecting this pattern in a
  5368. // bottom-up phase will be simpler and less costly than building a
  5369. // full-blown top-down phase beginning at the consecutive loads.
  5370. Changed |= tryToVectorizeList(Bundle, R);
  5371. }
  5372. }
  5373. return Changed;
  5374. }
  5375. bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  5376. bool Changed = false;
  5377. // Attempt to sort and vectorize each of the store-groups.
  5378. for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
  5379. ++it) {
  5380. if (it->second.size() < 2)
  5381. continue;
  5382. DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
  5383. << it->second.size() << ".\n");
  5384. // Process the stores in chunks of 16.
  5385. // TODO: The limit of 16 inhibits greater vectorization factors.
  5386. // For example, AVX2 supports v32i8. Increasing this limit, however,
  5387. // may cause a significant compile-time increase.
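// Illustrative example with a made-up count: a group of 40 collected stores
// is handed to vectorizeStores() in chunks of 16, 16 and 8.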
  5388. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
  5389. unsigned Len = std::min<unsigned>(CE - CI, 16);
  5390. Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
  5391. }
  5392. }
  5393. return Changed;
  5394. }
  5395. char SLPVectorizer::ID = 0;
  5396. static const char lv_name[] = "SLP Vectorizer";
  5397. INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
  5398. INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  5399. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  5400. INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  5401. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  5402. INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  5403. INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
  5404. INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
  5405. INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
  5406. Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }