//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// is found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>

using namespace llvm;
using namespace slpvectorizer;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
    ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                       cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
    MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
        cl::desc("Attempt to vectorize for this register size in bits"));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
    ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
        cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

static cl::opt<unsigned> RecursionMaxDepth(
    "slp-recursion-max-depth", cl::init(12), cl::Hidden,
    cl::desc("Limit the recursion depth when building a vectorizable tree"));

static cl::opt<unsigned> MinTreeSize(
    "slp-min-tree-size", cl::init(3), cl::Hidden,
    cl::desc("Only vectorize small trees if they are fully vectorizable"));

static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return false;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return false;
    if (BB != I->getParent())
      return false;
  }
  return true;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (Value *i : VL)
    if (!isa<Constant>(i))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns Opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

/// true if the \p Value is odd, false otherwise.
static bool isOdd(unsigned Value) {
  return Value & 1;
}

/// \returns bool representing if Opcode \p Op can be part
/// of an alternate sequence which can later be merged as
/// a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  return Op == Instruction::FAdd || Op == Instruction::FSub ||
         Op == Instruction::Sub || Op == Instruction::Add;
}
/// \returns ShuffleVector instruction if the instructions in \p VL form an
/// alternating fadd/fsub, fsub/fadd, add/sub or sub/add sequence
/// (e.g. opcodes of fadd, fsub, fadd, fsub, ...).
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != (isOdd(i) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
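// Illustrative example: for a bundle VL = { fadd, fsub, fadd, fsub }, even
// lanes carry the base opcode and odd lanes carry its alternate from
// getAltOpcode(), so isAltInst() returns Instruction::ShuffleVector. A bundle
// such as { fadd, fadd, fsub, fsub } breaks the even/odd pattern and yields 0.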
/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return false;
  return true;
}

/// \returns True if Extract{Value,Element} instruction extracts element Idx.
static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) {
  assert(Opcode == Instruction::ExtractElement ||
         Opcode == Instruction::ExtractValue);
  if (Opcode == Instruction::ExtractElement) {
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    return CI && CI->getZExtValue() == Idx;
  } else {
    ExtractValueInst *EI = cast<ExtractValueInst>(E);
    return EI->getNumIndices() == 1 && *EI->idx_begin() == Idx;
  }
}
/// \returns True if an in-tree use also needs an extract. This refers to a
/// possible scalar operand in a vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {
  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
    LLVM_FALLTHROUGH;
  }
  default:
    return false;
  }
}
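// Example: if the tree vectorizes the address computation feeding a scalar
// store "store i32 %x, i32* %scalar.ptr" (names illustrative), the store still
// needs %scalar.ptr as a scalar, so an extract from the vectorized value is
// required. The same holds for a load's pointer operand and for intrinsic
// operands that must stay scalar, as checked via hasVectorInstrinsicScalarOpd.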
/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

namespace llvm {
namespace slpvectorizer {

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;
  typedef MapVector<Value *, SmallVector<Instruction *, 2>>
      ExtraValueToDebugLocsMap;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL, OptimizationRemarkEmitter *ORE)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
        DL(DL), ORE(ORE), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
    // Use the vector register size specified by the target unless overridden
    // by a command-line option.
    // TODO: It would be better to limit the vectorization factor based on
    // data type rather than just register size. For example, x86 AVX has
    // 256-bit registers, but it does not support integer operations
    // at that width (that requires AVX2).
    if (MaxVectorRegSizeOption.getNumOccurrences())
      MaxVecRegSize = MaxVectorRegSizeOption;
    else
      MaxVecRegSize = TTI->getRegisterBitWidth(true);
    if (MinVectorRegSizeOption.getNumOccurrences())
      MinVecRegSize = MinVectorRegSizeOption;
    else
      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
  }
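  // Example: the register-size bounds can be forced from the command line,
  // e.g. "opt -slp-vectorizer -slp-max-reg-size=256 -slp-min-reg-size=128";
  // when the flags are not given, the TargetTransformInfo values queried above
  // are used instead.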
  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();
  /// Vectorize the tree but with the list of externally used values \p
  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
  /// generated extractvalue instructions.
  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
  /// into account (and updating it, if required) the list of externally used
  /// values stored in \p ExternallyUsedValues.
  void buildTree(ArrayRef<Value *> Roots,
                 ExtraValueToDebugLocsMap &ExternallyUsedValues,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
  }

  unsigned getTreeSize() const { return VectorizableTree.size(); }

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();

  // \returns maximum vector register size as set by TTI or overridden by cl::opt.
  unsigned getMaxVecRegSize() const {
    return MaxVecRegSize;
  }

  // \returns minimum vector register size as set by cl::opt.
  unsigned getMinVecRegSize() const {
    return MinVecRegSize;
  }

  /// \brief Check if ArrayType or StructType is isomorphic to some VectorType.
  ///
  /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
  unsigned canMapToVector(Type *T, const DataLayout &DL) const;

  /// \returns True if the VectorizableTree is both tiny and not fully
  /// vectorizable. We do not vectorize such trees.
  bool isTreeTinyAndNotFullyVectorizable();

  OptimizationRemarkEmitter *getORE() { return ORE; }

private:
  struct TreeEntry;
  /// Checks if all users of \p I are part of the vectorization tree.
  bool areAllUsersVectorized(Instruction *I) const;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);

  /// \returns True if the ExtractElement/ExtractValue instructions in VL can
  /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
  bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const;

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const;

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// Reorder commutative operands in alt shuffle if they result in
  /// vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);

  /// Reorder commutative operands to get better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
  struct TreeEntry {
    TreeEntry(std::vector<TreeEntry> &Container)
        : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
          Container(Container) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;

    /// Points back to the VectorizableTree.
    ///
    /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
    /// to be a pointer and needs to be able to initialize the child iterator.
    /// Thus we need a reference back to the container to translate the indices
    /// to entries.
    std::vector<TreeEntry> &Container;

    /// The TreeEntry index containing the user of this entry. We can actually
    /// have multiple users so the data structure is not truly a tree.
    SmallVector<int, 1> UserTreeIndices;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
                          int &UserTreeIdx) {
    VectorizableTree.emplace_back(VectorizableTree);
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    if (UserTreeIdx >= 0)
      Last->UserTreeIndices.push_back(UserTreeIdx);
    UserTreeIdx = idx;
    return Last;
  }
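  // Note on the in/out parameter above: on entry, UserTreeIdx holds the index
  // of the entry that uses this bundle (negative for the root); on return it
  // is overwritten with the index of the newly created entry, so a caller that
  // recurses into operands threads parent/child links through UserTreeIndices.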
  /// -- Vectorization State --
  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  TreeEntry *getTreeEntry(Value *V) {
    auto I = ScalarToTreeEntry.find(V);
    if (I != ScalarToTreeEntry.end())
      return &VectorizableTree[I->second];
    return nullptr;
  }

  const TreeEntry *getTreeEntry(Value *V) const {
    auto I = ScalarToTreeEntry.find(V);
    if (I != ScalarToTreeEntry.end())
      return &VectorizableTree[I->second];
    return nullptr;
  }

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // Which scalar in our function.
    Value *Scalar;
    // Which user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {
    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }
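  // Note: the cache key is the ordered pair (Inst1, Inst2), so the results for
  // (A, B) and (B, A) are cached independently; any pair that cannot be proven
  // simple and disjoint is conservatively recorded as aliased.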
  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.emplace_back(I);
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<unique_value, 8> DeletedInstructions;
  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User). External User
  /// can be nullptr, which means that this Internal Scalar will be used later,
  /// after vectorization.
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;

  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {
    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }
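    // Dump format: a bundle head prints all bundle members as "[I0;I1;...]",
    // a non-head member prints "/ I", and an unbundled instruction prints "I".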
    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;
    /// The number of dependencies. This is the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;
    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };

#ifndef NDEBUG
  friend inline raw_ostream &operator<<(raw_ostream &os,
                                        const BoUpSLP::ScheduleData &SD) {
    SD.dump(os);
    return os;
  }
#endif

  friend struct GraphTraits<BoUpSLP *>;
  friend struct DOTGraphTraits<BoUpSLP *>;

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;
      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }
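    // Worked example: if an instruction (say %mul, name illustrative) has two
    // unscheduled users inside the region, its unscheduled-dependency count is
    // 2; scheduling each user calls incrementUnscheduledDeps(-1) on %mul's
    // ScheduleData, and once the count reaches 0 the head of %mul's bundle is
    // pushed onto the ready list.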
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);
    /// Sets all instructions in the scheduling region to un-scheduled.
  757. void resetSchedule();
  758. BasicBlock *BB;
  759. /// Simple memory allocation for ScheduleData.
  760. std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
  761. /// The size of a ScheduleData array in ScheduleDataChunks.
  762. int ChunkSize;
  763. /// The allocator position in the current chunk, which is the last entry
  764. /// of ScheduleDataChunks.
  765. int ChunkPos;
  766. /// Attaches ScheduleData to Instruction.
  767. /// Note that the mapping survives during all vectorization iterations, i.e.
  768. /// ScheduleData structures are recycled.
  769. DenseMap<Value *, ScheduleData *> ScheduleDataMap;
  770. struct ReadyList : SmallVector<ScheduleData *, 8> {
  771. void insert(ScheduleData *SD) { push_back(SD); }
  772. };
  773. /// The ready-list for scheduling (only used for the dry-run).
  774. ReadyList ReadyInsts;
  775. /// The first instruction of the scheduling region.
  776. Instruction *ScheduleStart;
  777. /// The first instruction _after_ the scheduling region.
  778. Instruction *ScheduleEnd;
  779. /// The first memory accessing instruction in the scheduling region
  780. /// (can be null).
  781. ScheduleData *FirstLoadStoreInRegion;
  782. /// The last memory accessing instruction in the scheduling region
  783. /// (can be null).
  784. ScheduleData *LastLoadStoreInRegion;
  785. /// The current size of the scheduling region.
  786. int ScheduleRegionSize;
  787. /// The maximum size allowed for the scheduling region.
  788. int ScheduleRegionSizeLimit;
  789. /// The ID of the scheduling region. For a new vectorization iteration this
  790. /// is incremented which "removes" all ScheduleData from the region.
  791. int SchedulingRegionID;
  792. };
  793. /// Attaches the BlockScheduling structures to basic blocks.
  794. MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
  795. /// Performs the "real" scheduling. Done before vectorization is actually
  796. /// performed in a basic block.
  797. void scheduleBlock(BlockScheduling *BS);
  798. /// List of users to ignore during scheduling and that don't need extracting.
  799. ArrayRef<Value *> UserIgnoreList;
  800. // Number of load bundles that contain consecutive loads.
  801. int NumLoadsWantToKeepOrder;
  802. // Number of load bundles that contain consecutive loads in reversed order.
  803. int NumLoadsWantToChangeOrder;
  804. // Analysis and block reference.
  805. Function *F;
  806. ScalarEvolution *SE;
  807. TargetTransformInfo *TTI;
  808. TargetLibraryInfo *TLI;
  809. AliasAnalysis *AA;
  810. LoopInfo *LI;
  811. DominatorTree *DT;
  812. AssumptionCache *AC;
  813. DemandedBits *DB;
  814. const DataLayout *DL;
  815. OptimizationRemarkEmitter *ORE;
  816. unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  817. unsigned MinVecRegSize; // Set by cl::opt (default: 128).
  818. /// Instruction builder to construct the vectorized tree.
  819. IRBuilder<> Builder;
  820. /// A map of scalar integer values to the smallest bit width with which they
  821. /// can legally be represented. The values map to (width, signed) pairs,
  822. /// where "width" indicates the minimum bit width and "signed" is True if the
823. value must be sign-extended, rather than zero-extended, back to its
  824. /// original width.
  825. MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
  826. };
  827. } // end namespace slpvectorizer
  828. template <> struct GraphTraits<BoUpSLP *> {
  829. typedef BoUpSLP::TreeEntry TreeEntry;
  830. /// NodeRef has to be a pointer per the GraphWriter.
  831. typedef TreeEntry *NodeRef;
  832. /// \brief Add the VectorizableTree to the index iterator to be able to return
  833. /// TreeEntry pointers.
  834. struct ChildIteratorType
  835. : public iterator_adaptor_base<ChildIteratorType,
  836. SmallVector<int, 1>::iterator> {
  837. std::vector<TreeEntry> &VectorizableTree;
  838. ChildIteratorType(SmallVector<int, 1>::iterator W,
  839. std::vector<TreeEntry> &VT)
  840. : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
  841. NodeRef operator*() { return &VectorizableTree[*I]; }
  842. };
  843. static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
  844. static ChildIteratorType child_begin(NodeRef N) {
  845. return {N->UserTreeIndices.begin(), N->Container};
  846. }
  847. static ChildIteratorType child_end(NodeRef N) {
  848. return {N->UserTreeIndices.end(), N->Container};
  849. }
  850. /// For the node iterator we just need to turn the TreeEntry iterator into a
  851. /// TreeEntry* iterator so that it dereferences to NodeRef.
  852. typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
  853. static nodes_iterator nodes_begin(BoUpSLP *R) {
  854. return nodes_iterator(R->VectorizableTree.begin());
  855. }
  856. static nodes_iterator nodes_end(BoUpSLP *R) {
  857. return nodes_iterator(R->VectorizableTree.end());
  858. }
  859. static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
  860. };
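// Together with the DOTGraphTraits specialization below, these traits allow
// the SLP tree to be rendered with ViewGraph (getTreeCost does this when the
// ViewSLPTree option is set): each TreeEntry becomes a node, and the
// UserTreeIndices recorded during buildTree_rec provide the edges.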
  861. template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  862. typedef BoUpSLP::TreeEntry TreeEntry;
  863. DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
  864. std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
  865. std::string Str;
  866. raw_string_ostream OS(Str);
  867. if (isSplat(Entry->Scalars)) {
  868. OS << "<splat> " << *Entry->Scalars[0];
  869. return Str;
  870. }
  871. for (auto V : Entry->Scalars) {
  872. OS << *V;
  873. if (std::any_of(
  874. R->ExternalUses.begin(), R->ExternalUses.end(),
  875. [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
  876. OS << " <extract>";
  877. OS << "\n";
  878. }
  879. return Str;
  880. }
  881. static std::string getNodeAttributes(const TreeEntry *Entry,
  882. const BoUpSLP *) {
  883. if (Entry->NeedToGather)
  884. return "color=red";
  885. return "";
  886. }
  887. };
  888. } // end namespace llvm
  889. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  890. ArrayRef<Value *> UserIgnoreLst) {
  891. ExtraValueToDebugLocsMap ExternallyUsedValues;
  892. buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
  893. }
  894. void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
  895. ExtraValueToDebugLocsMap &ExternallyUsedValues,
  896. ArrayRef<Value *> UserIgnoreLst) {
  897. deleteTree();
  898. UserIgnoreList = UserIgnoreLst;
  899. if (!allSameType(Roots))
  900. return;
  901. buildTree_rec(Roots, 0, -1);
  902. // Collect the values that we need to extract from the tree.
  903. for (TreeEntry &EIdx : VectorizableTree) {
  904. TreeEntry *Entry = &EIdx;
  905. // No need to handle users of gathered values.
  906. if (Entry->NeedToGather)
  907. continue;
  908. // For each lane:
  909. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  910. Value *Scalar = Entry->Scalars[Lane];
  911. // Check if the scalar is externally used as an extra arg.
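// An "extra arg" is a scalar the caller asked to keep available after
// vectorization (passed in via ExternallyUsedValues); it is recorded as an
// external use with a null user so that, like any other external use, an
// extract from the vectorized tree can be accounted for it.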
  912. auto ExtI = ExternallyUsedValues.find(Scalar);
  913. if (ExtI != ExternallyUsedValues.end()) {
  914. DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
  915. Lane << " from " << *Scalar << ".\n");
  916. ExternalUses.emplace_back(Scalar, nullptr, Lane);
  917. continue;
  918. }
  919. for (User *U : Scalar->users()) {
  920. DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
  921. Instruction *UserInst = dyn_cast<Instruction>(U);
  922. if (!UserInst)
  923. continue;
  924. // Skip in-tree scalars that become vectors
  925. if (TreeEntry *UseEntry = getTreeEntry(U)) {
  926. Value *UseScalar = UseEntry->Scalars[0];
  927. // Some in-tree scalars will remain as scalar in vectorized
  928. // instructions. If that is the case, the one in Lane 0 will
  929. // be used.
  930. if (UseScalar != U ||
  931. !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
  932. DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
  933. << ".\n");
  934. assert(!UseEntry->NeedToGather && "Bad state");
  935. continue;
  936. }
  937. }
  938. // Ignore users in the user ignore list.
  939. if (is_contained(UserIgnoreList, UserInst))
  940. continue;
  941. DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
  942. Lane << " from " << *Scalar << ".\n");
  943. ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
  944. }
  945. }
  946. }
  947. }
  948. void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
  949. int UserTreeIdx) {
  950. bool isAltShuffle = false;
  951. assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
  952. if (Depth == RecursionMaxDepth) {
  953. DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
  954. newTreeEntry(VL, false, UserTreeIdx);
  955. return;
  956. }
  957. // Don't handle vectors.
  958. if (VL[0]->getType()->isVectorTy()) {
  959. DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
  960. newTreeEntry(VL, false, UserTreeIdx);
  961. return;
  962. }
  963. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  964. if (SI->getValueOperand()->getType()->isVectorTy()) {
  965. DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
  966. newTreeEntry(VL, false, UserTreeIdx);
  967. return;
  968. }
  969. unsigned Opcode = getSameOpcode(VL);
  970. // Check that this shuffle vector refers to the alternate
  971. // sequence of opcodes.
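// E.g. a bundle such as {a[0]+b[0], a[1]-b[1], a[2]+b[2], a[3]-b[3]} is
// reported by getSameOpcode as Instruction::ShuffleVector; isAltShuffle
// distinguishes this add/sub-style pattern from a bundle of actual
// shufflevector instructions.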
  972. if (Opcode == Instruction::ShuffleVector) {
  973. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  974. unsigned Op = I0->getOpcode();
  975. if (Op != Instruction::ShuffleVector)
  976. isAltShuffle = true;
  977. }
  978. // If all of the operands are identical or constant we have a simple solution.
  979. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
  980. DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
  981. newTreeEntry(VL, false, UserTreeIdx);
  982. return;
  983. }
  984. // We now know that this is a vector of instructions of the same type from
  985. // the same block.
  986. // Don't vectorize ephemeral values.
  987. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  988. if (EphValues.count(VL[i])) {
  989. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  990. ") is ephemeral.\n");
  991. newTreeEntry(VL, false, UserTreeIdx);
  992. return;
  993. }
  994. }
  995. // Check if this is a duplicate of another entry.
  996. if (TreeEntry *E = getTreeEntry(VL[0])) {
  997. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  998. DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
  999. if (E->Scalars[i] != VL[i]) {
  1000. DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
  1001. newTreeEntry(VL, false, UserTreeIdx);
  1002. return;
  1003. }
  1004. }
1005. // Record the reuse of the tree node. FIXME: currently this is only used to
  1006. // properly draw the graph rather than for the actual vectorization.
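// A "perfect diamond merge": the exact same bundle was reached again through a
// different user, so instead of building a duplicate node we only record the
// extra user edge on the existing TreeEntry.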
  1007. E->UserTreeIndices.push_back(UserTreeIdx);
  1008. DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
  1009. return;
  1010. }
  1011. // Check that none of the instructions in the bundle are already in the tree.
  1012. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1013. if (ScalarToTreeEntry.count(VL[i])) {
  1014. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  1015. ") is already in tree.\n");
  1016. newTreeEntry(VL, false, UserTreeIdx);
  1017. return;
  1018. }
  1019. }
  1020. // If any of the scalars is marked as a value that needs to stay scalar then
  1021. // we need to gather the scalars.
  1022. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  1023. if (MustGather.count(VL[i])) {
  1024. DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
  1025. newTreeEntry(VL, false, UserTreeIdx);
  1026. return;
  1027. }
  1028. }
  1029. // Check that all of the users of the scalars that we want to vectorize are
  1030. // schedulable.
  1031. Instruction *VL0 = cast<Instruction>(VL[0]);
  1032. BasicBlock *BB = VL0->getParent();
  1033. if (!DT->isReachableFromEntry(BB)) {
  1034. // Don't go into unreachable blocks. They may contain instructions with
  1035. // dependency cycles which confuse the final scheduling.
  1036. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
  1037. newTreeEntry(VL, false, UserTreeIdx);
  1038. return;
  1039. }
1040. // Check that every instruction appears only once in this bundle.
  1041. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  1042. for (unsigned j = i+1; j < e; ++j)
  1043. if (VL[i] == VL[j]) {
  1044. DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
  1045. newTreeEntry(VL, false, UserTreeIdx);
  1046. return;
  1047. }
  1048. auto &BSRef = BlocksSchedules[BB];
  1049. if (!BSRef) {
  1050. BSRef = llvm::make_unique<BlockScheduling>(BB);
  1051. }
  1052. BlockScheduling &BS = *BSRef.get();
  1053. if (!BS.tryScheduleBundle(VL, this, VL0)) {
  1054. DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
  1055. assert((!BS.getScheduleData(VL[0]) ||
  1056. !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
  1057. "tryScheduleBundle should cancelScheduling on failure");
  1058. newTreeEntry(VL, false, UserTreeIdx);
  1059. return;
  1060. }
  1061. DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  1062. switch (Opcode) {
  1063. case Instruction::PHI: {
  1064. PHINode *PH = dyn_cast<PHINode>(VL0);
  1065. // Check for terminator values (e.g. invoke).
  1066. for (unsigned j = 0; j < VL.size(); ++j)
  1067. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1068. TerminatorInst *Term = dyn_cast<TerminatorInst>(
  1069. cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
  1070. if (Term) {
  1071. DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
  1072. BS.cancelScheduling(VL, VL0);
  1073. newTreeEntry(VL, false, UserTreeIdx);
  1074. return;
  1075. }
  1076. }
  1077. newTreeEntry(VL, true, UserTreeIdx);
  1078. DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
  1079. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1080. ValueList Operands;
  1081. // Prepare the operand vector.
  1082. for (Value *j : VL)
  1083. Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
  1084. PH->getIncomingBlock(i)));
  1085. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1086. }
  1087. return;
  1088. }
  1089. case Instruction::ExtractValue:
  1090. case Instruction::ExtractElement: {
  1091. bool Reuse = canReuseExtract(VL, VL0);
  1092. if (Reuse) {
  1093. DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
  1094. } else {
  1095. BS.cancelScheduling(VL, VL0);
  1096. }
  1097. newTreeEntry(VL, Reuse, UserTreeIdx);
  1098. return;
  1099. }
  1100. case Instruction::Load: {
  1101. // Check that a vectorized load would load the same memory as a scalar
  1102. // load.
1103. // For example, we don't want to vectorize loads that are smaller than 8 bits.
1104. // Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM treats
1105. // loading/storing it as an i8 struct. If we vectorize loads/stores from
1106. // such a struct, we read/write packed bits disagreeing with the
  1107. // unvectorized version.
  1108. Type *ScalarTy = VL0->getType();
  1109. if (DL->getTypeSizeInBits(ScalarTy) !=
  1110. DL->getTypeAllocSizeInBits(ScalarTy)) {
  1111. BS.cancelScheduling(VL, VL0);
  1112. newTreeEntry(VL, false, UserTreeIdx);
  1113. DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
  1114. return;
  1115. }
  1116. // Make sure all loads in the bundle are simple - we can't vectorize
  1117. // atomic or volatile loads.
  1118. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  1119. LoadInst *L = cast<LoadInst>(VL[i]);
  1120. if (!L->isSimple()) {
  1121. BS.cancelScheduling(VL, VL0);
  1122. newTreeEntry(VL, false, UserTreeIdx);
  1123. DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
  1124. return;
  1125. }
  1126. }
  1127. // Check if the loads are consecutive, reversed, or neither.
  1128. // TODO: What we really want is to sort the loads, but for now, check
  1129. // the two likely directions.
  1130. bool Consecutive = true;
  1131. bool ReverseConsecutive = true;
  1132. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  1133. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1134. Consecutive = false;
  1135. break;
  1136. } else {
  1137. ReverseConsecutive = false;
  1138. }
  1139. }
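// E.g. loads of a[0],a[1],a[2],a[3] keep their order (Consecutive), while
// loads of a[3],a[2],a[1],a[0] are only consecutive when walked backwards
// (ReverseConsecutive); the latter bundle is gathered below and counted in
// NumLoadsWantToChangeOrder.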
  1140. if (Consecutive) {
  1141. ++NumLoadsWantToKeepOrder;
  1142. newTreeEntry(VL, true, UserTreeIdx);
  1143. DEBUG(dbgs() << "SLP: added a vector of loads.\n");
  1144. return;
  1145. }
  1146. // If none of the load pairs were consecutive when checked in order,
  1147. // check the reverse order.
  1148. if (ReverseConsecutive)
  1149. for (unsigned i = VL.size() - 1; i > 0; --i)
  1150. if (!isConsecutiveAccess(VL[i], VL[i - 1], *DL, *SE)) {
  1151. ReverseConsecutive = false;
  1152. break;
  1153. }
  1154. BS.cancelScheduling(VL, VL0);
  1155. newTreeEntry(VL, false, UserTreeIdx);
  1156. if (ReverseConsecutive) {
  1157. ++NumLoadsWantToChangeOrder;
  1158. DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
  1159. } else {
  1160. DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
  1161. }
  1162. return;
  1163. }
  1164. case Instruction::ZExt:
  1165. case Instruction::SExt:
  1166. case Instruction::FPToUI:
  1167. case Instruction::FPToSI:
  1168. case Instruction::FPExt:
  1169. case Instruction::PtrToInt:
  1170. case Instruction::IntToPtr:
  1171. case Instruction::SIToFP:
  1172. case Instruction::UIToFP:
  1173. case Instruction::Trunc:
  1174. case Instruction::FPTrunc:
  1175. case Instruction::BitCast: {
  1176. Type *SrcTy = VL0->getOperand(0)->getType();
  1177. for (unsigned i = 0; i < VL.size(); ++i) {
  1178. Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
  1179. if (Ty != SrcTy || !isValidElementType(Ty)) {
  1180. BS.cancelScheduling(VL, VL0);
  1181. newTreeEntry(VL, false, UserTreeIdx);
  1182. DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
  1183. return;
  1184. }
  1185. }
  1186. newTreeEntry(VL, true, UserTreeIdx);
  1187. DEBUG(dbgs() << "SLP: added a vector of casts.\n");
  1188. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1189. ValueList Operands;
  1190. // Prepare the operand vector.
  1191. for (Value *j : VL)
  1192. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1193. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1194. }
  1195. return;
  1196. }
  1197. case Instruction::ICmp:
  1198. case Instruction::FCmp: {
  1199. // Check that all of the compares have the same predicate.
  1200. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  1201. Type *ComparedTy = VL0->getOperand(0)->getType();
  1202. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  1203. CmpInst *Cmp = cast<CmpInst>(VL[i]);
  1204. if (Cmp->getPredicate() != P0 ||
  1205. Cmp->getOperand(0)->getType() != ComparedTy) {
  1206. BS.cancelScheduling(VL, VL0);
  1207. newTreeEntry(VL, false, UserTreeIdx);
  1208. DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
  1209. return;
  1210. }
  1211. }
  1212. newTreeEntry(VL, true, UserTreeIdx);
  1213. DEBUG(dbgs() << "SLP: added a vector of compares.\n");
  1214. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1215. ValueList Operands;
  1216. // Prepare the operand vector.
  1217. for (Value *j : VL)
  1218. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1219. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1220. }
  1221. return;
  1222. }
  1223. case Instruction::Select:
  1224. case Instruction::Add:
  1225. case Instruction::FAdd:
  1226. case Instruction::Sub:
  1227. case Instruction::FSub:
  1228. case Instruction::Mul:
  1229. case Instruction::FMul:
  1230. case Instruction::UDiv:
  1231. case Instruction::SDiv:
  1232. case Instruction::FDiv:
  1233. case Instruction::URem:
  1234. case Instruction::SRem:
  1235. case Instruction::FRem:
  1236. case Instruction::Shl:
  1237. case Instruction::LShr:
  1238. case Instruction::AShr:
  1239. case Instruction::And:
  1240. case Instruction::Or:
  1241. case Instruction::Xor: {
  1242. newTreeEntry(VL, true, UserTreeIdx);
  1243. DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
  1244. // Sort operands of the instructions so that each side is more likely to
  1245. // have the same opcode.
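// E.g. for the commutative bundle {c + a[0], a[1] + c},
// reorderInputsAccordingToOpcode swaps the operands of the second lane so that
// Left = {c, c} (a splat) and Right = {a[0], a[1]} (two loads), giving the
// recursive calls below a better chance to vectorize each side.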
  1246. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
  1247. ValueList Left, Right;
  1248. reorderInputsAccordingToOpcode(VL, Left, Right);
  1249. buildTree_rec(Left, Depth + 1, UserTreeIdx);
  1250. buildTree_rec(Right, Depth + 1, UserTreeIdx);
  1251. return;
  1252. }
  1253. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1254. ValueList Operands;
  1255. // Prepare the operand vector.
  1256. for (Value *j : VL)
  1257. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1258. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1259. }
  1260. return;
  1261. }
  1262. case Instruction::GetElementPtr: {
  1263. // We don't combine GEPs with complicated (nested) indexing.
  1264. for (unsigned j = 0; j < VL.size(); ++j) {
  1265. if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
  1266. DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
  1267. BS.cancelScheduling(VL, VL0);
  1268. newTreeEntry(VL, false, UserTreeIdx);
  1269. return;
  1270. }
  1271. }
  1272. // We can't combine several GEPs into one vector if they operate on
  1273. // different types.
  1274. Type *Ty0 = VL0->getOperand(0)->getType();
  1275. for (unsigned j = 0; j < VL.size(); ++j) {
  1276. Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
  1277. if (Ty0 != CurTy) {
  1278. DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
  1279. BS.cancelScheduling(VL, VL0);
  1280. newTreeEntry(VL, false, UserTreeIdx);
  1281. return;
  1282. }
  1283. }
  1284. // We don't combine GEPs with non-constant indexes.
  1285. for (unsigned j = 0; j < VL.size(); ++j) {
  1286. auto Op = cast<Instruction>(VL[j])->getOperand(1);
  1287. if (!isa<ConstantInt>(Op)) {
  1288. DEBUG(
  1289. dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
  1290. BS.cancelScheduling(VL, VL0);
  1291. newTreeEntry(VL, false, UserTreeIdx);
  1292. return;
  1293. }
  1294. }
  1295. newTreeEntry(VL, true, UserTreeIdx);
  1296. DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
  1297. for (unsigned i = 0, e = 2; i < e; ++i) {
  1298. ValueList Operands;
  1299. // Prepare the operand vector.
  1300. for (Value *j : VL)
  1301. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1302. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1303. }
  1304. return;
  1305. }
  1306. case Instruction::Store: {
1307. // Check if the stores are consecutive or if we need to swizzle them.
  1308. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
  1309. if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
  1310. BS.cancelScheduling(VL, VL0);
  1311. newTreeEntry(VL, false, UserTreeIdx);
  1312. DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
  1313. return;
  1314. }
  1315. newTreeEntry(VL, true, UserTreeIdx);
  1316. DEBUG(dbgs() << "SLP: added a vector of stores.\n");
  1317. ValueList Operands;
  1318. for (Value *j : VL)
  1319. Operands.push_back(cast<Instruction>(j)->getOperand(0));
  1320. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1321. return;
  1322. }
  1323. case Instruction::Call: {
  1324. // Check if the calls are all to the same vectorizable intrinsic.
  1325. CallInst *CI = cast<CallInst>(VL0);
  1326. // Check if this is an Intrinsic call or something that can be
  1327. // represented by an intrinsic call
  1328. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1329. if (!isTriviallyVectorizable(ID)) {
  1330. BS.cancelScheduling(VL, VL0);
  1331. newTreeEntry(VL, false, UserTreeIdx);
  1332. DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
  1333. return;
  1334. }
  1335. Function *Int = CI->getCalledFunction();
  1336. Value *A1I = nullptr;
  1337. if (hasVectorInstrinsicScalarOpd(ID, 1))
  1338. A1I = CI->getArgOperand(1);
  1339. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1340. CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
  1341. if (!CI2 || CI2->getCalledFunction() != Int ||
  1342. getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
  1343. !CI->hasIdenticalOperandBundleSchema(*CI2)) {
  1344. BS.cancelScheduling(VL, VL0);
  1345. newTreeEntry(VL, false, UserTreeIdx);
  1346. DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
  1347. << "\n");
  1348. return;
  1349. }
1350. // ctlz, cttz and powi are special intrinsics whose second argument
1351. // should be the same in order for them to be vectorized.
  1352. if (hasVectorInstrinsicScalarOpd(ID, 1)) {
  1353. Value *A1J = CI2->getArgOperand(1);
  1354. if (A1I != A1J) {
  1355. BS.cancelScheduling(VL, VL0);
  1356. newTreeEntry(VL, false, UserTreeIdx);
  1357. DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
  1358. << " argument "<< A1I<<"!=" << A1J
  1359. << "\n");
  1360. return;
  1361. }
  1362. }
  1363. // Verify that the bundle operands are identical between the two calls.
  1364. if (CI->hasOperandBundles() &&
  1365. !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
  1366. CI->op_begin() + CI->getBundleOperandsEndIndex(),
  1367. CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
  1368. BS.cancelScheduling(VL, VL0);
  1369. newTreeEntry(VL, false, UserTreeIdx);
  1370. DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
  1371. << *VL[i] << '\n');
  1372. return;
  1373. }
  1374. }
  1375. newTreeEntry(VL, true, UserTreeIdx);
  1376. for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
  1377. ValueList Operands;
  1378. // Prepare the operand vector.
  1379. for (Value *j : VL) {
  1380. CallInst *CI2 = dyn_cast<CallInst>(j);
  1381. Operands.push_back(CI2->getArgOperand(i));
  1382. }
  1383. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1384. }
  1385. return;
  1386. }
  1387. case Instruction::ShuffleVector: {
1388. // If this is not an alternate sequence of opcodes like add-sub,
1389. // then do not vectorize this instruction.
  1390. if (!isAltShuffle) {
  1391. BS.cancelScheduling(VL, VL0);
  1392. newTreeEntry(VL, false, UserTreeIdx);
  1393. DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
  1394. return;
  1395. }
  1396. newTreeEntry(VL, true, UserTreeIdx);
  1397. DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
  1398. // Reorder operands if reordering would enable vectorization.
  1399. if (isa<BinaryOperator>(VL0)) {
  1400. ValueList Left, Right;
  1401. reorderAltShuffleOperands(VL, Left, Right);
  1402. buildTree_rec(Left, Depth + 1, UserTreeIdx);
  1403. buildTree_rec(Right, Depth + 1, UserTreeIdx);
  1404. return;
  1405. }
  1406. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  1407. ValueList Operands;
  1408. // Prepare the operand vector.
  1409. for (Value *j : VL)
  1410. Operands.push_back(cast<Instruction>(j)->getOperand(i));
  1411. buildTree_rec(Operands, Depth + 1, UserTreeIdx);
  1412. }
  1413. return;
  1414. }
  1415. default:
  1416. BS.cancelScheduling(VL, VL0);
  1417. newTreeEntry(VL, false, UserTreeIdx);
  1418. DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  1419. return;
  1420. }
  1421. }
  1422. unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
  1423. unsigned N;
  1424. Type *EltTy;
  1425. auto *ST = dyn_cast<StructType>(T);
  1426. if (ST) {
  1427. N = ST->getNumElements();
  1428. EltTy = *ST->element_begin();
  1429. } else {
  1430. N = cast<ArrayType>(T)->getNumElements();
  1431. EltTy = cast<ArrayType>(T)->getElementType();
  1432. }
  1433. if (!isValidElementType(EltTy))
  1434. return 0;
  1435. uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
1436. if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL.getTypeStoreSizeInBits(T))
  1437. return 0;
  1438. if (ST) {
  1439. // Check that struct is homogeneous.
  1440. for (const auto *Ty : ST->elements())
  1441. if (Ty != EltTy)
  1442. return 0;
  1443. }
  1444. return N;
  1445. }
  1446. bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue) const {
  1447. Instruction *E0 = cast<Instruction>(OpValue);
  1448. assert(E0->getOpcode() == Instruction::ExtractElement ||
  1449. E0->getOpcode() == Instruction::ExtractValue);
  1450. assert(E0->getOpcode() == getSameOpcode(VL) && "Invalid opcode");
  1451. // Check if all of the extracts come from the same vector and from the
  1452. // correct offset.
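// E.g. {extractelement <4 x i32> %v, 0 .. 3}, all in lane order and all
// reading from the same %v, can simply reuse %v instead of being re-gathered
// (see the ExtractElement case in vectorizeTree).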
  1453. Value *Vec = E0->getOperand(0);
  1454. // We have to extract from a vector/aggregate with the same number of elements.
  1455. unsigned NElts;
  1456. if (E0->getOpcode() == Instruction::ExtractValue) {
  1457. const DataLayout &DL = E0->getModule()->getDataLayout();
  1458. NElts = canMapToVector(Vec->getType(), DL);
  1459. if (!NElts)
  1460. return false;
  1461. // Check if load can be rewritten as load of vector.
  1462. LoadInst *LI = dyn_cast<LoadInst>(Vec);
  1463. if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
  1464. return false;
  1465. } else {
  1466. NElts = Vec->getType()->getVectorNumElements();
  1467. }
  1468. if (NElts != VL.size())
  1469. return false;
  1470. // Check that all of the indices extract from the correct offset.
  1471. for (unsigned I = 0, E = VL.size(); I < E; ++I) {
  1472. Instruction *Inst = cast<Instruction>(VL[I]);
  1473. if (!matchExtractIndex(Inst, I, Inst->getOpcode()))
  1474. return false;
  1475. if (Inst->getOperand(0) != Vec)
  1476. return false;
  1477. }
  1478. return true;
  1479. }
  1480. bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
  1481. return I->hasOneUse() ||
  1482. std::all_of(I->user_begin(), I->user_end(), [this](User *U) {
  1483. return ScalarToTreeEntry.count(U) > 0;
  1484. });
  1485. }
  1486. int BoUpSLP::getEntryCost(TreeEntry *E) {
  1487. ArrayRef<Value*> VL = E->Scalars;
  1488. Type *ScalarTy = VL[0]->getType();
  1489. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1490. ScalarTy = SI->getValueOperand()->getType();
  1491. else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
  1492. ScalarTy = CI->getOperand(0)->getType();
  1493. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1494. // If we have computed a smaller type for the expression, update VecTy so
  1495. // that the costs will be accurate.
  1496. if (MinBWs.count(VL[0]))
  1497. VecTy = VectorType::get(
  1498. IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
  1499. if (E->NeedToGather) {
  1500. if (allConstant(VL))
  1501. return 0;
  1502. if (isSplat(VL)) {
  1503. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  1504. }
  1505. return getGatherCost(E->Scalars);
  1506. }
  1507. unsigned Opcode = getSameOpcode(VL);
  1508. assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
  1509. Instruction *VL0 = cast<Instruction>(VL[0]);
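// The cases below generally return (vector cost) - (summed scalar cost), so a
// negative result means vectorizing this bundle is expected to be profitable.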
  1510. switch (Opcode) {
  1511. case Instruction::PHI: {
  1512. return 0;
  1513. }
  1514. case Instruction::ExtractValue:
  1515. case Instruction::ExtractElement: {
  1516. if (canReuseExtract(VL, VL0)) {
  1517. int DeadCost = 0;
  1518. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1519. Instruction *E = cast<Instruction>(VL[i]);
1520. // If all users are going to be vectorized, the instruction can be
1521. // considered dead.
1522. // Likewise, if it has only one user, it will be vectorized for sure.
  1523. if (areAllUsersVectorized(E))
  1524. // Take credit for instruction that will become dead.
  1525. DeadCost +=
  1526. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
  1527. }
  1528. return -DeadCost;
  1529. }
  1530. return getGatherCost(VecTy);
  1531. }
  1532. case Instruction::ZExt:
  1533. case Instruction::SExt:
  1534. case Instruction::FPToUI:
  1535. case Instruction::FPToSI:
  1536. case Instruction::FPExt:
  1537. case Instruction::PtrToInt:
  1538. case Instruction::IntToPtr:
  1539. case Instruction::SIToFP:
  1540. case Instruction::UIToFP:
  1541. case Instruction::Trunc:
  1542. case Instruction::FPTrunc:
  1543. case Instruction::BitCast: {
  1544. Type *SrcTy = VL0->getOperand(0)->getType();
  1545. // Calculate the cost of this instruction.
  1546. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
  1547. VL0->getType(), SrcTy, VL0);
  1548. VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
  1549. int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
  1550. return VecCost - ScalarCost;
  1551. }
  1552. case Instruction::FCmp:
  1553. case Instruction::ICmp:
  1554. case Instruction::Select: {
  1555. // Calculate the cost of this instruction.
  1556. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
  1557. int ScalarCost = VecTy->getNumElements() *
  1558. TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
  1559. int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
  1560. return VecCost - ScalarCost;
  1561. }
  1562. case Instruction::Add:
  1563. case Instruction::FAdd:
  1564. case Instruction::Sub:
  1565. case Instruction::FSub:
  1566. case Instruction::Mul:
  1567. case Instruction::FMul:
  1568. case Instruction::UDiv:
  1569. case Instruction::SDiv:
  1570. case Instruction::FDiv:
  1571. case Instruction::URem:
  1572. case Instruction::SRem:
  1573. case Instruction::FRem:
  1574. case Instruction::Shl:
  1575. case Instruction::LShr:
  1576. case Instruction::AShr:
  1577. case Instruction::And:
  1578. case Instruction::Or:
  1579. case Instruction::Xor: {
  1580. // Certain instructions can be cheaper to vectorize if they have a
  1581. // constant second vector operand.
  1582. TargetTransformInfo::OperandValueKind Op1VK =
  1583. TargetTransformInfo::OK_AnyValue;
  1584. TargetTransformInfo::OperandValueKind Op2VK =
  1585. TargetTransformInfo::OK_UniformConstantValue;
  1586. TargetTransformInfo::OperandValueProperties Op1VP =
  1587. TargetTransformInfo::OP_None;
  1588. TargetTransformInfo::OperandValueProperties Op2VP =
  1589. TargetTransformInfo::OP_None;
  1590. // If all operands are exactly the same ConstantInt then set the
  1591. // operand kind to OK_UniformConstantValue.
  1592. // If instead not all operands are constants, then set the operand kind
  1593. // to OK_AnyValue. If all operands are constants but not the same,
  1594. // then set the operand kind to OK_NonUniformConstantValue.
  1595. ConstantInt *CInt = nullptr;
  1596. for (unsigned i = 0; i < VL.size(); ++i) {
  1597. const Instruction *I = cast<Instruction>(VL[i]);
  1598. if (!isa<ConstantInt>(I->getOperand(1))) {
  1599. Op2VK = TargetTransformInfo::OK_AnyValue;
  1600. break;
  1601. }
  1602. if (i == 0) {
  1603. CInt = cast<ConstantInt>(I->getOperand(1));
  1604. continue;
  1605. }
  1606. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
  1607. CInt != cast<ConstantInt>(I->getOperand(1)))
  1608. Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
  1609. }
1610. // FIXME: Currently the cost-model modification for division by a power of
1611. // 2 is handled only for X86 and AArch64. Add support for other targets.
  1612. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
  1613. CInt->getValue().isPowerOf2())
  1614. Op2VP = TargetTransformInfo::OP_PowerOf2;
  1615. SmallVector<const Value *, 4> Operands(VL0->operand_values());
  1616. int ScalarCost =
  1617. VecTy->getNumElements() *
  1618. TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
  1619. Op2VP, Operands);
  1620. int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
  1621. Op1VP, Op2VP, Operands);
  1622. return VecCost - ScalarCost;
  1623. }
  1624. case Instruction::GetElementPtr: {
  1625. TargetTransformInfo::OperandValueKind Op1VK =
  1626. TargetTransformInfo::OK_AnyValue;
  1627. TargetTransformInfo::OperandValueKind Op2VK =
  1628. TargetTransformInfo::OK_UniformConstantValue;
  1629. int ScalarCost =
  1630. VecTy->getNumElements() *
  1631. TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
  1632. int VecCost =
  1633. TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
  1634. return VecCost - ScalarCost;
  1635. }
  1636. case Instruction::Load: {
  1637. // Cost of wide load - cost of scalar loads.
1638. unsigned alignment = cast<LoadInst>(VL0)->getAlignment();
  1639. int ScalarLdCost = VecTy->getNumElements() *
  1640. TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
  1641. int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
  1642. VecTy, alignment, 0, VL0);
  1643. return VecLdCost - ScalarLdCost;
  1644. }
  1645. case Instruction::Store: {
  1646. // We know that we can merge the stores. Calculate the cost.
1647. unsigned alignment = cast<StoreInst>(VL0)->getAlignment();
  1648. int ScalarStCost = VecTy->getNumElements() *
  1649. TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
  1650. int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
  1651. VecTy, alignment, 0, VL0);
  1652. return VecStCost - ScalarStCost;
  1653. }
  1654. case Instruction::Call: {
  1655. CallInst *CI = cast<CallInst>(VL0);
  1656. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  1657. // Calculate the cost of the scalar and vector calls.
  1658. SmallVector<Type*, 4> ScalarTys;
  1659. for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
  1660. ScalarTys.push_back(CI->getArgOperand(op)->getType());
  1661. FastMathFlags FMF;
  1662. if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
  1663. FMF = FPMO->getFastMathFlags();
  1664. int ScalarCallCost = VecTy->getNumElements() *
  1665. TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
  1666. SmallVector<Value *, 4> Args(CI->arg_operands());
  1667. int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
  1668. VecTy->getNumElements());
  1669. DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
  1670. << " (" << VecCallCost << "-" << ScalarCallCost << ")"
  1671. << " for " << *CI << "\n");
  1672. return VecCallCost - ScalarCallCost;
  1673. }
  1674. case Instruction::ShuffleVector: {
  1675. TargetTransformInfo::OperandValueKind Op1VK =
  1676. TargetTransformInfo::OK_AnyValue;
  1677. TargetTransformInfo::OperandValueKind Op2VK =
  1678. TargetTransformInfo::OK_AnyValue;
  1679. int ScalarCost = 0;
  1680. int VecCost = 0;
  1681. for (Value *i : VL) {
  1682. Instruction *I = cast<Instruction>(i);
  1683. if (!I)
  1684. break;
  1685. ScalarCost +=
  1686. TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
  1687. }
1688. // VecCost is equal to the sum of the cost of creating 2 vectors
1689. // and the cost of creating the shuffle.
  1690. Instruction *I0 = cast<Instruction>(VL[0]);
  1691. VecCost =
  1692. TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
  1693. Instruction *I1 = cast<Instruction>(VL[1]);
  1694. VecCost +=
  1695. TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
  1696. VecCost +=
  1697. TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
  1698. return VecCost - ScalarCost;
  1699. }
  1700. default:
  1701. llvm_unreachable("Unknown instruction");
  1702. }
  1703. }
  1704. bool BoUpSLP::isFullyVectorizableTinyTree() {
  1705. DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
  1706. VectorizableTree.size() << " is fully vectorizable .\n");
  1707. // We only handle trees of heights 1 and 2.
  1708. if (VectorizableTree.size() == 1 && !VectorizableTree[0].NeedToGather)
  1709. return true;
  1710. if (VectorizableTree.size() != 2)
  1711. return false;
  1712. // Handle splat and all-constants stores.
  1713. if (!VectorizableTree[0].NeedToGather &&
  1714. (allConstant(VectorizableTree[1].Scalars) ||
  1715. isSplat(VectorizableTree[1].Scalars)))
  1716. return true;
  1717. // Gathering cost would be too much for tiny trees.
  1718. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
  1719. return false;
  1720. return true;
  1721. }
  1722. bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() {
  1723. // We can vectorize the tree if its size is greater than or equal to the
  1724. // minimum size specified by the MinTreeSize command line option.
  1725. if (VectorizableTree.size() >= MinTreeSize)
  1726. return false;
  1727. // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  1728. // can vectorize it if we can prove it fully vectorizable.
  1729. if (isFullyVectorizableTinyTree())
  1730. return false;
1731. assert((!VectorizableTree.empty() || ExternalUses.empty()) &&
1732. "We shouldn't have any external users");
  1734. // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  1735. // vectorizable.
  1736. return true;
  1737. }
  1738. int BoUpSLP::getSpillCost() {
  1739. // Walk from the bottom of the tree to the top, tracking which values are
  1740. // live. When we see a call instruction that is not part of our tree,
  1741. // query TTI to see if there is a cost to keeping values live over it
  1742. // (for example, if spills and fills are required).
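// E.g. if a call separates two tree instructions while three tree values are
// live, TTI::getCostOfKeepingLiveOverCall is queried with three vector types
// of width BundleWidth and its result is added to the spill cost.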
  1743. unsigned BundleWidth = VectorizableTree.front().Scalars.size();
  1744. int Cost = 0;
  1745. SmallPtrSet<Instruction*, 4> LiveValues;
  1746. Instruction *PrevInst = nullptr;
  1747. for (const auto &N : VectorizableTree) {
  1748. Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
  1749. if (!Inst)
  1750. continue;
  1751. if (!PrevInst) {
  1752. PrevInst = Inst;
  1753. continue;
  1754. }
  1755. // Update LiveValues.
  1756. LiveValues.erase(PrevInst);
  1757. for (auto &J : PrevInst->operands()) {
  1758. if (isa<Instruction>(&*J) && getTreeEntry(&*J))
  1759. LiveValues.insert(cast<Instruction>(&*J));
  1760. }
  1761. DEBUG(
  1762. dbgs() << "SLP: #LV: " << LiveValues.size();
  1763. for (auto *X : LiveValues)
  1764. dbgs() << " " << X->getName();
  1765. dbgs() << ", Looking at ";
  1766. Inst->dump();
  1767. );
  1768. // Now find the sequence of instructions between PrevInst and Inst.
  1769. BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
  1770. PrevInstIt =
  1771. PrevInst->getIterator().getReverse();
  1772. while (InstIt != PrevInstIt) {
  1773. if (PrevInstIt == PrevInst->getParent()->rend()) {
  1774. PrevInstIt = Inst->getParent()->rbegin();
  1775. continue;
  1776. }
  1777. if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
  1778. SmallVector<Type*, 4> V;
  1779. for (auto *II : LiveValues)
  1780. V.push_back(VectorType::get(II->getType(), BundleWidth));
  1781. Cost += TTI->getCostOfKeepingLiveOverCall(V);
  1782. }
  1783. ++PrevInstIt;
  1784. }
  1785. PrevInst = Inst;
  1786. }
  1787. return Cost;
  1788. }
  1789. int BoUpSLP::getTreeCost() {
  1790. int Cost = 0;
  1791. DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
  1792. VectorizableTree.size() << ".\n");
  1793. unsigned BundleWidth = VectorizableTree[0].Scalars.size();
  1794. for (TreeEntry &TE : VectorizableTree) {
  1795. int C = getEntryCost(&TE);
  1796. DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
  1797. << *TE.Scalars[0] << ".\n");
  1798. Cost += C;
  1799. }
  1800. SmallSet<Value *, 16> ExtractCostCalculated;
  1801. int ExtractCost = 0;
  1802. for (ExternalUser &EU : ExternalUses) {
  1803. // We only add extract cost once for the same scalar.
  1804. if (!ExtractCostCalculated.insert(EU.Scalar).second)
  1805. continue;
  1806. // Uses by ephemeral values are free (because the ephemeral value will be
  1807. // removed prior to code generation, and so the extraction will be
  1808. // removed as well).
  1809. if (EphValues.count(EU.User))
  1810. continue;
  1811. // If we plan to rewrite the tree in a smaller type, we will need to sign
  1812. // extend the extracted value back to the original type. Here, we account
  1813. // for the extract and the added cost of the sign extend if needed.
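// E.g. if the tree was demoted to i8 but this external user still expects the
// original i32 scalar, the cost is getExtractWithExtendCost(SExt or ZExt, i32,
// <BundleWidth x i8>, Lane) rather than a plain extractelement cost.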
  1814. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
  1815. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  1816. if (MinBWs.count(ScalarRoot)) {
  1817. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
  1818. auto Extend =
  1819. MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
  1820. VecTy = VectorType::get(MinTy, BundleWidth);
  1821. ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
  1822. VecTy, EU.Lane);
  1823. } else {
  1824. ExtractCost +=
  1825. TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
  1826. }
  1827. }
  1828. int SpillCost = getSpillCost();
  1829. Cost += SpillCost + ExtractCost;
  1830. std::string Str;
  1831. {
  1832. raw_string_ostream OS(Str);
  1833. OS << "SLP: Spill Cost = " << SpillCost << ".\n"
  1834. << "SLP: Extract Cost = " << ExtractCost << ".\n"
  1835. << "SLP: Total Cost = " << Cost << ".\n";
  1836. }
  1837. DEBUG(dbgs() << Str);
  1838. if (ViewSLPTree)
  1839. ViewGraph(this, "SLP" + F->getName(), false, Str);
  1840. return Cost;
  1841. }
  1842. int BoUpSLP::getGatherCost(Type *Ty) {
  1843. int Cost = 0;
  1844. for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
  1845. Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  1846. return Cost;
  1847. }
  1848. int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  1849. // Find the type of the operands in VL.
  1850. Type *ScalarTy = VL[0]->getType();
  1851. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1852. ScalarTy = SI->getValueOperand()->getType();
  1853. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1854. // Find the cost of inserting/extracting values from the vector.
  1855. return getGatherCost(VecTy);
  1856. }
  1857. // Reorder commutative operations in alternate shuffle if the resulting vectors
  1858. // are consecutive loads. This would allow us to vectorize the tree.
  1859. // If we have something like-
  1860. // load a[0] - load b[0]
  1861. // load b[1] + load a[1]
  1862. // load a[2] - load b[2]
  1863. // load a[3] + load b[3]
1864. // Reordering the second lane (load b[1], load a[1]) would allow us to
1865. // vectorize this code.
  1866. void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
  1867. SmallVectorImpl<Value *> &Left,
  1868. SmallVectorImpl<Value *> &Right) {
  1869. // Push left and right operands of binary operation into Left and Right
  1870. for (Value *i : VL) {
  1871. Left.push_back(cast<Instruction>(i)->getOperand(0));
  1872. Right.push_back(cast<Instruction>(i)->getOperand(1));
  1873. }
  1874. // Reorder if we have a commutative operation and consecutive access
  1875. // are on either side of the alternate instructions.
  1876. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  1877. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  1878. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  1879. Instruction *VL1 = cast<Instruction>(VL[j]);
  1880. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1881. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  1882. std::swap(Left[j], Right[j]);
  1883. continue;
  1884. } else if (VL2->isCommutative() &&
  1885. isConsecutiveAccess(L, L1, *DL, *SE)) {
  1886. std::swap(Left[j + 1], Right[j + 1]);
  1887. continue;
  1888. }
  1889. // else unchanged
  1890. }
  1891. }
  1892. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  1893. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  1894. Instruction *VL1 = cast<Instruction>(VL[j]);
  1895. Instruction *VL2 = cast<Instruction>(VL[j + 1]);
  1896. if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
  1897. std::swap(Left[j], Right[j]);
  1898. continue;
  1899. } else if (VL2->isCommutative() &&
  1900. isConsecutiveAccess(L, L1, *DL, *SE)) {
  1901. std::swap(Left[j + 1], Right[j + 1]);
  1902. continue;
  1903. }
  1904. // else unchanged
  1905. }
  1906. }
  1907. }
  1908. }
1909. // Return true if I should be commuted before adding its left and right
  1910. // operands to the arrays Left and Right.
  1911. //
1912. // The vectorizer is trying to either have all elements on one side be
1913. // instructions with the same opcode to enable further vectorization, or to
1914. // have a splat to lower the vectorization cost.
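// E.g. if Right currently holds the splat {a, a} and the next instruction is
// "a + b", it is commuted so that 'a' stays on the right-hand side and the
// splat is preserved (and 'b' goes to the left).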
  1915. static bool shouldReorderOperands(int i, Instruction &I,
  1916. SmallVectorImpl<Value *> &Left,
  1917. SmallVectorImpl<Value *> &Right,
  1918. bool AllSameOpcodeLeft,
  1919. bool AllSameOpcodeRight, bool SplatLeft,
  1920. bool SplatRight) {
  1921. Value *VLeft = I.getOperand(0);
  1922. Value *VRight = I.getOperand(1);
  1923. // If we have "SplatRight", try to see if commuting is needed to preserve it.
  1924. if (SplatRight) {
  1925. if (VRight == Right[i - 1])
  1926. // Preserve SplatRight
  1927. return false;
  1928. if (VLeft == Right[i - 1]) {
  1929. // Commuting would preserve SplatRight, but we don't want to break
  1930. // SplatLeft either, i.e. preserve the original order if possible.
  1931. // (FIXME: why do we care?)
  1932. if (SplatLeft && VLeft == Left[i - 1])
  1933. return false;
  1934. return true;
  1935. }
  1936. }
1937. // Symmetrically handle Left side (SplatLeft).
  1938. if (SplatLeft) {
  1939. if (VLeft == Left[i - 1])
  1940. // Preserve SplatLeft
  1941. return false;
  1942. if (VRight == Left[i - 1])
  1943. return true;
  1944. }
  1945. Instruction *ILeft = dyn_cast<Instruction>(VLeft);
  1946. Instruction *IRight = dyn_cast<Instruction>(VRight);
1947. // If we have "AllSameOpcodeRight", try to see if the left operand preserves
1948. // it and not the right; in that case we want to commute.
  1949. if (AllSameOpcodeRight) {
  1950. unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
  1951. if (IRight && RightPrevOpcode == IRight->getOpcode())
  1952. // Do not commute, a match on the right preserves AllSameOpcodeRight
  1953. return false;
  1954. if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
  1955. // We have a match and may want to commute, but first check if there is
  1956. // not also a match on the existing operands on the Left to preserve
  1957. // AllSameOpcodeLeft, i.e. preserve the original order if possible.
  1958. // (FIXME: why do we care?)
  1959. if (AllSameOpcodeLeft && ILeft &&
  1960. cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
  1961. return false;
  1962. return true;
  1963. }
  1964. }
  1965. // Symmetrically handle Left side.
  1966. if (AllSameOpcodeLeft) {
  1967. unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
  1968. if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
  1969. return false;
  1970. if (IRight && LeftPrevOpcode == IRight->getOpcode())
  1971. return true;
  1972. }
  1973. return false;
  1974. }
  1975. void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  1976. SmallVectorImpl<Value *> &Left,
  1977. SmallVectorImpl<Value *> &Right) {
  1978. if (VL.size()) {
  1979. // Peel the first iteration out of the loop since there's nothing
  1980. // interesting to do anyway and it simplifies the checks in the loop.
  1981. auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
  1982. auto VRight = cast<Instruction>(VL[0])->getOperand(1);
  1983. if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
  1984. // Favor having instruction to the right. FIXME: why?
  1985. std::swap(VLeft, VRight);
  1986. Left.push_back(VLeft);
  1987. Right.push_back(VRight);
  1988. }
  1989. // Keep track if we have instructions with all the same opcode on one side.
  1990. bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
  1991. bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
  1992. // Keep track if we have one side with all the same value (broadcast).
  1993. bool SplatLeft = true;
  1994. bool SplatRight = true;
  1995. for (unsigned i = 1, e = VL.size(); i != e; ++i) {
  1996. Instruction *I = cast<Instruction>(VL[i]);
  1997. assert(I->isCommutative() && "Can only process commutative instruction");
  1998. // Commute to favor either a splat or maximizing having the same opcodes on
  1999. // one side.
  2000. if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
  2001. AllSameOpcodeRight, SplatLeft, SplatRight)) {
  2002. Left.push_back(I->getOperand(1));
  2003. Right.push_back(I->getOperand(0));
  2004. } else {
  2005. Left.push_back(I->getOperand(0));
  2006. Right.push_back(I->getOperand(1));
  2007. }
  2008. // Update Splat* and AllSameOpcode* after the insertion.
  2009. SplatRight = SplatRight && (Right[i - 1] == Right[i]);
  2010. SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
  2011. AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
  2012. (cast<Instruction>(Left[i - 1])->getOpcode() ==
  2013. cast<Instruction>(Left[i])->getOpcode());
  2014. AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
  2015. (cast<Instruction>(Right[i - 1])->getOpcode() ==
  2016. cast<Instruction>(Right[i])->getOpcode());
  2017. }
2018. // If one operand ends up being a broadcast, return this operand order.
  2019. if (SplatRight || SplatLeft)
  2020. return;
2021. // Finally check if we can get a longer vectorizable chain by reordering
2022. // without breaking the good operand order detected above.
  2023. // E.g. If we have something like-
  2024. // load a[0] load b[0]
  2025. // load b[1] load a[1]
  2026. // load a[2] load b[2]
  2027. // load a[3] load b[3]
2028. // Reordering the second lane (load b[1], load a[1]) would allow us to
2029. // vectorize this code and still retain the AllSameOpcode property.
  2030. // FIXME: This load reordering might break AllSameOpcode in some rare cases
  2031. // such as-
  2032. // add a[0],c[0] load b[0]
  2033. // add a[1],c[2] load b[1]
  2034. // b[2] load b[2]
  2035. // add a[3],c[3] load b[3]
  2036. for (unsigned j = 0; j < VL.size() - 1; ++j) {
  2037. if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
  2038. if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
  2039. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  2040. std::swap(Left[j + 1], Right[j + 1]);
  2041. continue;
  2042. }
  2043. }
  2044. }
  2045. if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
  2046. if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
  2047. if (isConsecutiveAccess(L, L1, *DL, *SE)) {
  2048. std::swap(Left[j + 1], Right[j + 1]);
  2049. continue;
  2050. }
  2051. }
  2052. }
  2053. // else unchanged
  2054. }
  2055. }
  2056. void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
  2057. // Get the basic block this bundle is in. All instructions in the bundle
  2058. // should be in this block.
  2059. auto *Front = cast<Instruction>(VL.front());
  2060. auto *BB = Front->getParent();
  2061. assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool {
  2062. return cast<Instruction>(V)->getParent() == BB;
  2063. }));
  2064. // The last instruction in the bundle in program order.
  2065. Instruction *LastInst = nullptr;
  2066. // Find the last instruction. The common case should be that BB has been
  2067. // scheduled, and the last instruction is VL.back(). So we start with
  2068. // VL.back() and iterate over schedule data until we reach the end of the
  2069. // bundle. The end of the bundle is marked by null ScheduleData.
  2070. if (BlocksSchedules.count(BB)) {
  2071. auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back());
  2072. if (Bundle && Bundle->isPartOfBundle())
  2073. for (; Bundle; Bundle = Bundle->NextInBundle)
  2074. LastInst = Bundle->Inst;
  2075. }
  2076. // LastInst can still be null at this point if there's either not an entry
  2077. // for BB in BlocksSchedules or there's no ScheduleData available for
  2078. // VL.back(). This can be the case if buildTree_rec aborts for various
  2079. // reasons (e.g., the maximum recursion depth is reached, the maximum region
  2080. // size is reached, etc.). ScheduleData is initialized in the scheduling
  2081. // "dry-run".
  2082. //
  2083. // If this happens, we can still find the last instruction by brute force. We
  2084. // iterate forwards from Front (inclusive) until we either see all
  2085. // instructions in the bundle or reach the end of the block. If Front is the
  2086. // last instruction in program order, LastInst will be set to Front, and we
  2087. // will visit all the remaining instructions in the block.
  2088. //
  2089. // One of the reasons we exit early from buildTree_rec is to place an upper
  2090. // bound on compile-time. Thus, taking an additional compile-time hit here is
  2091. // not ideal. However, this should be exceedingly rare since it requires that
  2092. // we both exit early from buildTree_rec and that the bundle be out-of-order
  2093. // (causing us to iterate all the way to the end of the block).
  2094. if (!LastInst) {
  2095. SmallPtrSet<Value *, 16> Bundle(VL.begin(), VL.end());
  2096. for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
  2097. if (Bundle.erase(&I))
  2098. LastInst = &I;
  2099. if (Bundle.empty())
  2100. break;
  2101. }
  2102. }
  2103. // Set the insertion point after the last instruction in the bundle. Set the
  2104. // debug location to Front.
  2105. Builder.SetInsertPoint(BB, ++LastInst->getIterator());
  2106. Builder.SetCurrentDebugLocation(Front->getDebugLoc());
  2107. }
  2108. Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  2109. Value *Vec = UndefValue::get(Ty);
  2110. // Generate the 'InsertElement' instruction.
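// E.g. for VL = {a, b, c, d} this emits the chain
//   %v0 = insertelement undef, a, 0
//   %v1 = insertelement %v0, b, 1   ; ... and so on up to lane 3.
// Scalars that are themselves part of the tree are added to ExternalUses so
// the insertelement can later be fed from the corresponding vector lane.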
  2111. for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
  2112. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
  2113. if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
  2114. GatherSeq.insert(Insrt);
  2115. CSEBlocks.insert(Insrt->getParent());
  2116. // Add to our 'need-to-extract' list.
  2117. if (TreeEntry *E = getTreeEntry(VL[i])) {
  2118. // Find which lane we need to extract.
  2119. int FoundLane = -1;
  2120. for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
2121. // Is this the lane of the scalar that we are looking for?
  2122. if (E->Scalars[Lane] == VL[i]) {
  2123. FoundLane = Lane;
  2124. break;
  2125. }
  2126. }
  2127. assert(FoundLane >= 0 && "Could not find the correct lane");
  2128. ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
  2129. }
  2130. }
  2131. }
  2132. return Vec;
  2133. }
  2134. Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL, Value *OpValue) const {
  2135. if (const TreeEntry *En = getTreeEntry(OpValue)) {
  2136. if (En->isSame(VL) && En->VectorizedValue)
  2137. return En->VectorizedValue;
  2138. }
  2139. return nullptr;
  2140. }
  2141. Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  2142. if (TreeEntry *E = getTreeEntry(VL[0]))
  2143. if (E->isSame(VL))
  2144. return vectorizeTree(E);
  2145. Type *ScalarTy = VL[0]->getType();
  2146. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  2147. ScalarTy = SI->getValueOperand()->getType();
  2148. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  2149. return Gather(VL, VecTy);
  2150. }
  2151. Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  2152. IRBuilder<>::InsertPointGuard Guard(Builder);
  2153. if (E->VectorizedValue) {
  2154. DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
  2155. return E->VectorizedValue;
  2156. }
  2157. Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  2158. Type *ScalarTy = VL0->getType();
  2159. if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
  2160. ScalarTy = SI->getValueOperand()->getType();
  2161. VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
  2162. if (E->NeedToGather) {
  2163. setInsertPointAfterBundle(E->Scalars);
  2164. auto *V = Gather(E->Scalars, VecTy);
  2165. E->VectorizedValue = V;
  2166. return V;
  2167. }
  2168. unsigned Opcode = getSameOpcode(E->Scalars);
  2169. switch (Opcode) {
  2170. case Instruction::PHI: {
  2171. PHINode *PH = dyn_cast<PHINode>(VL0);
  2172. Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
  2173. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  2174. PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  2175. E->VectorizedValue = NewPhi;
  2176. // PHINodes may have multiple entries from the same block. We want to
  2177. // visit every block once.
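// For example, a phi reached twice from the same predecessor (e.g. through a
// switch with two case edges into this block) must get the same vectorized
// incoming value on both entries, so repeated blocks simply reuse the value
// that was already added instead of vectorizing the operands again.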
  2178. SmallSet<BasicBlock*, 4> VisitedBBs;
  2179. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  2180. ValueList Operands;
  2181. BasicBlock *IBB = PH->getIncomingBlock(i);
  2182. if (!VisitedBBs.insert(IBB).second) {
  2183. NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
  2184. continue;
  2185. }
  2186. // Prepare the operand vector.
  2187. for (Value *V : E->Scalars)
  2188. Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
  2189. Builder.SetInsertPoint(IBB->getTerminator());
  2190. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  2191. Value *Vec = vectorizeTree(Operands);
  2192. NewPhi->addIncoming(Vec, IBB);
  2193. }
  2194. assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
  2195. "Invalid number of incoming values");
  2196. return NewPhi;
  2197. }
  2198. case Instruction::ExtractElement: {
  2199. if (canReuseExtract(E->Scalars, VL0)) {
  2200. Value *V = VL0->getOperand(0);
  2201. E->VectorizedValue = V;
  2202. return V;
  2203. }
  2204. setInsertPointAfterBundle(E->Scalars);
  2205. auto *V = Gather(E->Scalars, VecTy);
  2206. E->VectorizedValue = V;
  2207. return V;
  2208. }
  2209. case Instruction::ExtractValue: {
  2210. if (canReuseExtract(E->Scalars, VL0)) {
  2211. LoadInst *LI = cast<LoadInst>(VL0->getOperand(0));
  2212. Builder.SetInsertPoint(LI);
  2213. PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
  2214. Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
  2215. LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment());
  2216. E->VectorizedValue = V;
  2217. return propagateMetadata(V, E->Scalars);
  2218. }
  2219. setInsertPointAfterBundle(E->Scalars);
  2220. auto *V = Gather(E->Scalars, VecTy);
  2221. E->VectorizedValue = V;
  2222. return V;
  2223. }
  2224. case Instruction::ZExt:
  2225. case Instruction::SExt:
  2226. case Instruction::FPToUI:
  2227. case Instruction::FPToSI:
  2228. case Instruction::FPExt:
  2229. case Instruction::PtrToInt:
  2230. case Instruction::IntToPtr:
  2231. case Instruction::SIToFP:
  2232. case Instruction::UIToFP:
  2233. case Instruction::Trunc:
  2234. case Instruction::FPTrunc:
  2235. case Instruction::BitCast: {
  2236. ValueList INVL;
  2237. for (Value *V : E->Scalars)
  2238. INVL.push_back(cast<Instruction>(V)->getOperand(0));
  2239. setInsertPointAfterBundle(E->Scalars);
  2240. Value *InVec = vectorizeTree(INVL);
  2241. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2242. return V;
  2243. CastInst *CI = dyn_cast<CastInst>(VL0);
  2244. Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
  2245. E->VectorizedValue = V;
  2246. ++NumVectorInstructions;
  2247. return V;
  2248. }
  2249. case Instruction::FCmp:
  2250. case Instruction::ICmp: {
  2251. ValueList LHSV, RHSV;
  2252. for (Value *V : E->Scalars) {
  2253. LHSV.push_back(cast<Instruction>(V)->getOperand(0));
  2254. RHSV.push_back(cast<Instruction>(V)->getOperand(1));
  2255. }
  2256. setInsertPointAfterBundle(E->Scalars);
  2257. Value *L = vectorizeTree(LHSV);
  2258. Value *R = vectorizeTree(RHSV);
  2259. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2260. return V;
  2261. CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
  2262. Value *V;
  2263. if (Opcode == Instruction::FCmp)
  2264. V = Builder.CreateFCmp(P0, L, R);
  2265. else
  2266. V = Builder.CreateICmp(P0, L, R);
  2267. E->VectorizedValue = V;
  2268. propagateIRFlags(E->VectorizedValue, E->Scalars);
  2269. ++NumVectorInstructions;
  2270. return V;
  2271. }
  2272. case Instruction::Select: {
  2273. ValueList TrueVec, FalseVec, CondVec;
  2274. for (Value *V : E->Scalars) {
  2275. CondVec.push_back(cast<Instruction>(V)->getOperand(0));
  2276. TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
  2277. FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
  2278. }
  2279. setInsertPointAfterBundle(E->Scalars);
  2280. Value *Cond = vectorizeTree(CondVec);
  2281. Value *True = vectorizeTree(TrueVec);
  2282. Value *False = vectorizeTree(FalseVec);
  2283. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2284. return V;
  2285. Value *V = Builder.CreateSelect(Cond, True, False);
  2286. E->VectorizedValue = V;
  2287. ++NumVectorInstructions;
  2288. return V;
  2289. }
  2290. case Instruction::Add:
  2291. case Instruction::FAdd:
  2292. case Instruction::Sub:
  2293. case Instruction::FSub:
  2294. case Instruction::Mul:
  2295. case Instruction::FMul:
  2296. case Instruction::UDiv:
  2297. case Instruction::SDiv:
  2298. case Instruction::FDiv:
  2299. case Instruction::URem:
  2300. case Instruction::SRem:
  2301. case Instruction::FRem:
  2302. case Instruction::Shl:
  2303. case Instruction::LShr:
  2304. case Instruction::AShr:
  2305. case Instruction::And:
  2306. case Instruction::Or:
  2307. case Instruction::Xor: {
  2308. ValueList LHSVL, RHSVL;
  2309. if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
  2310. reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
  2311. else
  2312. for (Value *V : E->Scalars) {
  2313. LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
  2314. RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
  2315. }
  2316. setInsertPointAfterBundle(E->Scalars);
  2317. Value *LHS = vectorizeTree(LHSVL);
  2318. Value *RHS = vectorizeTree(RHSVL);
  2319. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2320. return V;
  2321. BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
  2322. Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
  2323. E->VectorizedValue = V;
  2324. propagateIRFlags(E->VectorizedValue, E->Scalars);
  2325. ++NumVectorInstructions;
  2326. if (Instruction *I = dyn_cast<Instruction>(V))
  2327. return propagateMetadata(I, E->Scalars);
  2328. return V;
  2329. }
  2330. case Instruction::Load: {
  2331. // Loads are inserted at the head of the tree because we don't want to
  2332. // sink them all the way down past store instructions.
  2333. setInsertPointAfterBundle(E->Scalars);
  2334. LoadInst *LI = cast<LoadInst>(VL0);
  2335. Type *ScalarLoadTy = LI->getType();
  2336. unsigned AS = LI->getPointerAddressSpace();
  2337. Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
  2338. VecTy->getPointerTo(AS));
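// Illustrative shape of the emitted IR for a hypothetical bundle of four
// consecutive i32 loads through %p:
//   %vp = bitcast i32* %p to <4 x i32>*
//   %vl = load <4 x i32>, <4 x i32>* %vp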
2339. // The pointer operand uses an in-tree scalar, so we add the new BitCast to
2340. // the ExternalUses list to make sure that an extract will be generated in
2341. // the future.
  2342. Value *PO = LI->getPointerOperand();
  2343. if (getTreeEntry(PO))
  2344. ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
  2345. unsigned Alignment = LI->getAlignment();
  2346. LI = Builder.CreateLoad(VecPtr);
  2347. if (!Alignment) {
  2348. Alignment = DL->getABITypeAlignment(ScalarLoadTy);
  2349. }
  2350. LI->setAlignment(Alignment);
  2351. E->VectorizedValue = LI;
  2352. ++NumVectorInstructions;
  2353. return propagateMetadata(LI, E->Scalars);
  2354. }
  2355. case Instruction::Store: {
  2356. StoreInst *SI = cast<StoreInst>(VL0);
  2357. unsigned Alignment = SI->getAlignment();
  2358. unsigned AS = SI->getPointerAddressSpace();
  2359. ValueList ValueOp;
  2360. for (Value *V : E->Scalars)
  2361. ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
  2362. setInsertPointAfterBundle(E->Scalars);
  2363. Value *VecValue = vectorizeTree(ValueOp);
  2364. Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
  2365. VecTy->getPointerTo(AS));
  2366. StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
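// Illustrative result for a hypothetical bundle of four consecutive i32
// stores through %p: the stored scalars are vectorized into %vv above, and
// the bundle becomes
//   %vp = bitcast i32* %p to <4 x i32>*
//   store <4 x i32> %vv, <4 x i32>* %vp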
2367. // The pointer operand uses an in-tree scalar, so we add the new BitCast to
2368. // the ExternalUses list to make sure that an extract will be generated in
2369. // the future.
  2370. Value *PO = SI->getPointerOperand();
  2371. if (getTreeEntry(PO))
  2372. ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
  2373. if (!Alignment) {
  2374. Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
  2375. }
  2376. S->setAlignment(Alignment);
  2377. E->VectorizedValue = S;
  2378. ++NumVectorInstructions;
  2379. return propagateMetadata(S, E->Scalars);
  2380. }
  2381. case Instruction::GetElementPtr: {
  2382. setInsertPointAfterBundle(E->Scalars);
  2383. ValueList Op0VL;
  2384. for (Value *V : E->Scalars)
  2385. Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
  2386. Value *Op0 = vectorizeTree(Op0VL);
  2387. std::vector<Value *> OpVecs;
  2388. for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
  2389. ++j) {
  2390. ValueList OpVL;
  2391. for (Value *V : E->Scalars)
  2392. OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
  2393. Value *OpVec = vectorizeTree(OpVL);
  2394. OpVecs.push_back(OpVec);
  2395. }
  2396. Value *V = Builder.CreateGEP(
  2397. cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
  2398. E->VectorizedValue = V;
  2399. ++NumVectorInstructions;
  2400. if (Instruction *I = dyn_cast<Instruction>(V))
  2401. return propagateMetadata(I, E->Scalars);
  2402. return V;
  2403. }
  2404. case Instruction::Call: {
  2405. CallInst *CI = cast<CallInst>(VL0);
  2406. setInsertPointAfterBundle(VL0);
  2407. Function *FI;
  2408. Intrinsic::ID IID = Intrinsic::not_intrinsic;
  2409. Value *ScalarArg = nullptr;
  2410. if (CI && (FI = CI->getCalledFunction())) {
  2411. IID = FI->getIntrinsicID();
  2412. }
  2413. std::vector<Value *> OpVecs;
  2414. for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
  2415. ValueList OpVL;
2416. // ctlz, cttz and powi are special intrinsics whose second argument is
  2417. // a scalar. This argument should not be vectorized.
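// For example (shown schematically), four scalar calls to @llvm.powi.f32
// sharing one i32 exponent %n would become a single vector call roughly like
//   call <4 x float> @llvm.powi.v4f32(<4 x float> %x, i32 %n)
// with the exponent left scalar.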
  2418. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
  2419. CallInst *CEI = cast<CallInst>(E->Scalars[0]);
  2420. ScalarArg = CEI->getArgOperand(j);
  2421. OpVecs.push_back(CEI->getArgOperand(j));
  2422. continue;
  2423. }
  2424. for (Value *V : E->Scalars) {
  2425. CallInst *CEI = cast<CallInst>(V);
  2426. OpVL.push_back(CEI->getArgOperand(j));
  2427. }
  2428. Value *OpVec = vectorizeTree(OpVL);
  2429. DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
  2430. OpVecs.push_back(OpVec);
  2431. }
  2432. Module *M = F->getParent();
  2433. Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  2434. Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
  2435. Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
  2436. SmallVector<OperandBundleDef, 1> OpBundles;
  2437. CI->getOperandBundlesAsDefs(OpBundles);
  2438. Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
2439. // The scalar argument uses an in-tree scalar, so we add the new vectorized
2440. // call to the ExternalUses list to make sure that an extract will be
  2441. // generated in the future.
  2442. if (ScalarArg && getTreeEntry(ScalarArg))
  2443. ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
  2444. E->VectorizedValue = V;
  2445. propagateIRFlags(E->VectorizedValue, E->Scalars);
  2446. ++NumVectorInstructions;
  2447. return V;
  2448. }
  2449. case Instruction::ShuffleVector: {
  2450. ValueList LHSVL, RHSVL;
  2451. assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
  2452. reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
  2453. setInsertPointAfterBundle(E->Scalars);
  2454. Value *LHS = vectorizeTree(LHSVL);
  2455. Value *RHS = vectorizeTree(RHSVL);
  2456. if (Value *V = alreadyVectorized(E->Scalars, VL0))
  2457. return V;
  2458. // Create a vector of LHS op1 RHS
  2459. BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
  2460. Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
  2461. // Create a vector of LHS op2 RHS
  2462. Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
  2463. BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
  2464. Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
  2465. // Create shuffle to take alternate operations from the vector.
  2466. // Also, gather up odd and even scalar ops to propagate IR flags to
  2467. // each vector operation.
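// Illustrative example: for a hypothetical alternating bundle
// {a0+b0, a1-b1, a2+b2, a3-b3} we build V0 = add(LHS, RHS) and
// V1 = sub(LHS, RHS), then pick even lanes from V0 and odd lanes from V1
// with the shuffle mask <0, 5, 2, 7>.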
  2468. ValueList OddScalars, EvenScalars;
  2469. unsigned e = E->Scalars.size();
  2470. SmallVector<Constant *, 8> Mask(e);
  2471. for (unsigned i = 0; i < e; ++i) {
  2472. if (isOdd(i)) {
  2473. Mask[i] = Builder.getInt32(e + i);
  2474. OddScalars.push_back(E->Scalars[i]);
  2475. } else {
  2476. Mask[i] = Builder.getInt32(i);
  2477. EvenScalars.push_back(E->Scalars[i]);
  2478. }
  2479. }
  2480. Value *ShuffleMask = ConstantVector::get(Mask);
  2481. propagateIRFlags(V0, EvenScalars);
  2482. propagateIRFlags(V1, OddScalars);
  2483. Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
  2484. E->VectorizedValue = V;
  2485. ++NumVectorInstructions;
  2486. if (Instruction *I = dyn_cast<Instruction>(V))
  2487. return propagateMetadata(I, E->Scalars);
  2488. return V;
  2489. }
  2490. default:
  2491. llvm_unreachable("unknown inst");
  2492. }
  2493. return nullptr;
  2494. }
  2495. Value *BoUpSLP::vectorizeTree() {
  2496. ExtraValueToDebugLocsMap ExternallyUsedValues;
  2497. return vectorizeTree(ExternallyUsedValues);
  2498. }
  2499. Value *
  2500. BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
  2501. // All blocks must be scheduled before any instructions are inserted.
  2502. for (auto &BSIter : BlocksSchedules) {
  2503. scheduleBlock(BSIter.second.get());
  2504. }
  2505. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2506. auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
  2507. // If the vectorized tree can be rewritten in a smaller type, we truncate the
  2508. // vectorized root. InstCombine will then rewrite the entire expression. We
  2509. // sign extend the extracted values below.
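// Illustrative example: if the tree computes four i32 additions whose results
// only need 8 bits, the vector root is truncated to <4 x i8> here; each
// externally used lane is then extracted and sign- or zero-extended back to
// i32 by the 'extend' helper below.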
  2510. auto *ScalarRoot = VectorizableTree[0].Scalars[0];
  2511. if (MinBWs.count(ScalarRoot)) {
  2512. if (auto *I = dyn_cast<Instruction>(VectorRoot))
  2513. Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
  2514. auto BundleWidth = VectorizableTree[0].Scalars.size();
  2515. auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
  2516. auto *VecTy = VectorType::get(MinTy, BundleWidth);
  2517. auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
  2518. VectorizableTree[0].VectorizedValue = Trunc;
  2519. }
  2520. DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
  2521. // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
  2522. // specified by ScalarType.
  2523. auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
  2524. if (!MinBWs.count(ScalarRoot))
  2525. return Ex;
  2526. if (MinBWs[ScalarRoot].second)
  2527. return Builder.CreateSExt(Ex, ScalarType);
  2528. return Builder.CreateZExt(Ex, ScalarType);
  2529. };
  2530. // Extract all of the elements with the external uses.
  2531. for (const auto &ExternalUse : ExternalUses) {
  2532. Value *Scalar = ExternalUse.Scalar;
  2533. llvm::User *User = ExternalUse.User;
  2534. // Skip users that we already RAUW. This happens when one instruction
  2535. // has multiple uses of the same value.
  2536. if (User && !is_contained(Scalar->users(), User))
  2537. continue;
  2538. TreeEntry *E = getTreeEntry(Scalar);
  2539. assert(E && "Invalid scalar");
  2540. assert(!E->NeedToGather && "Extracting from a gather list");
  2541. Value *Vec = E->VectorizedValue;
  2542. assert(Vec && "Can't find vectorizable value");
  2543. Value *Lane = Builder.getInt32(ExternalUse.Lane);
  2544. // If User == nullptr, the Scalar is used as extra arg. Generate
  2545. // ExtractElement instruction and update the record for this scalar in
  2546. // ExternallyUsedValues.
  2547. if (!User) {
  2548. assert(ExternallyUsedValues.count(Scalar) &&
  2549. "Scalar with nullptr as an external user must be registered in "
  2550. "ExternallyUsedValues map");
  2551. if (auto *VecI = dyn_cast<Instruction>(Vec)) {
  2552. Builder.SetInsertPoint(VecI->getParent(),
  2553. std::next(VecI->getIterator()));
  2554. } else {
  2555. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2556. }
  2557. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2558. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2559. CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
  2560. auto &Locs = ExternallyUsedValues[Scalar];
  2561. ExternallyUsedValues.insert({Ex, Locs});
  2562. ExternallyUsedValues.erase(Scalar);
  2563. continue;
  2564. }
  2565. // Generate extracts for out-of-tree users.
  2566. // Find the insertion point for the extractelement lane.
  2567. if (auto *VecI = dyn_cast<Instruction>(Vec)) {
  2568. if (PHINode *PH = dyn_cast<PHINode>(User)) {
  2569. for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
  2570. if (PH->getIncomingValue(i) == Scalar) {
  2571. TerminatorInst *IncomingTerminator =
  2572. PH->getIncomingBlock(i)->getTerminator();
  2573. if (isa<CatchSwitchInst>(IncomingTerminator)) {
  2574. Builder.SetInsertPoint(VecI->getParent(),
  2575. std::next(VecI->getIterator()));
  2576. } else {
  2577. Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
  2578. }
  2579. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2580. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2581. CSEBlocks.insert(PH->getIncomingBlock(i));
  2582. PH->setOperand(i, Ex);
  2583. }
  2584. }
  2585. } else {
  2586. Builder.SetInsertPoint(cast<Instruction>(User));
  2587. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2588. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2589. CSEBlocks.insert(cast<Instruction>(User)->getParent());
  2590. User->replaceUsesOfWith(Scalar, Ex);
  2591. }
  2592. } else {
  2593. Builder.SetInsertPoint(&F->getEntryBlock().front());
  2594. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  2595. Ex = extend(ScalarRoot, Ex, Scalar->getType());
  2596. CSEBlocks.insert(&F->getEntryBlock());
  2597. User->replaceUsesOfWith(Scalar, Ex);
  2598. }
  2599. DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  2600. }
  2601. // For each vectorized value:
  2602. for (TreeEntry &EIdx : VectorizableTree) {
  2603. TreeEntry *Entry = &EIdx;
  2604. // For each lane:
  2605. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  2606. Value *Scalar = Entry->Scalars[Lane];
  2607. // No need to handle users of gathered values.
  2608. if (Entry->NeedToGather)
  2609. continue;
  2610. assert(Entry->VectorizedValue && "Can't find vectorizable value");
  2611. Type *Ty = Scalar->getType();
  2612. if (!Ty->isVoidTy()) {
  2613. #ifndef NDEBUG
  2614. for (User *U : Scalar->users()) {
  2615. DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
  2616. // It is legal to replace users in the ignorelist by undef.
  2617. assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
  2618. "Replacing out-of-tree value with undef");
  2619. }
  2620. #endif
  2621. Value *Undef = UndefValue::get(Ty);
  2622. Scalar->replaceAllUsesWith(Undef);
  2623. }
  2624. DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
  2625. eraseInstruction(cast<Instruction>(Scalar));
  2626. }
  2627. }
  2628. Builder.ClearInsertionPoint();
  2629. return VectorizableTree[0].VectorizedValue;
  2630. }
  2631. void BoUpSLP::optimizeGatherSequence() {
  2632. DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
  2633. << " gather sequences instructions.\n");
  2634. // LICM InsertElementInst sequences.
  2635. for (Instruction *it : GatherSeq) {
  2636. InsertElementInst *Insert = dyn_cast<InsertElementInst>(it);
  2637. if (!Insert)
  2638. continue;
  2639. // Check if this block is inside a loop.
  2640. Loop *L = LI->getLoopFor(Insert->getParent());
  2641. if (!L)
  2642. continue;
  2643. // Check if it has a preheader.
  2644. BasicBlock *PreHeader = L->getLoopPreheader();
  2645. if (!PreHeader)
  2646. continue;
  2647. // If the vector or the element that we insert into it are
  2648. // instructions that are defined in this basic block then we can't
  2649. // hoist this instruction.
  2650. Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
  2651. Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
  2652. if (CurrVec && L->contains(CurrVec))
  2653. continue;
  2654. if (NewElem && L->contains(NewElem))
  2655. continue;
  2656. // We can hoist this instruction. Move it to the pre-header.
  2657. Insert->moveBefore(PreHeader->getTerminator());
  2658. }
  2659. // Make a list of all reachable blocks in our CSE queue.
  2660. SmallVector<const DomTreeNode *, 8> CSEWorkList;
  2661. CSEWorkList.reserve(CSEBlocks.size());
  2662. for (BasicBlock *BB : CSEBlocks)
  2663. if (DomTreeNode *N = DT->getNode(BB)) {
  2664. assert(DT->isReachableFromEntry(N));
  2665. CSEWorkList.push_back(N);
  2666. }
  2667. // Sort blocks by domination. This ensures we visit a block after all blocks
  2668. // dominating it are visited.
  2669. std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
  2670. [this](const DomTreeNode *A, const DomTreeNode *B) {
  2671. return DT->properlyDominates(A, B);
  2672. });
  2673. // Perform O(N^2) search over the gather sequences and merge identical
  2674. // instructions. TODO: We can further optimize this scan if we split the
  2675. // instructions into different buckets based on the insert lane.
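// Illustrative example: if a dominating block and a dominated block both
// build the same <4 x i32> gather from the same scalars with identical
// insertelement chains, the dominated copy is replaced by the dominating one
// and erased below.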
  2676. SmallVector<Instruction *, 16> Visited;
  2677. for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
  2678. assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
  2679. "Worklist not sorted properly!");
  2680. BasicBlock *BB = (*I)->getBlock();
  2681. // For all instructions in blocks containing gather sequences:
  2682. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
  2683. Instruction *In = &*it++;
  2684. if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
  2685. continue;
  2686. // Check if we can replace this instruction with any of the
  2687. // visited instructions.
  2688. for (Instruction *v : Visited) {
  2689. if (In->isIdenticalTo(v) &&
  2690. DT->dominates(v->getParent(), In->getParent())) {
  2691. In->replaceAllUsesWith(v);
  2692. eraseInstruction(In);
  2693. In = nullptr;
  2694. break;
  2695. }
  2696. }
  2697. if (In) {
  2698. assert(!is_contained(Visited, In));
  2699. Visited.push_back(In);
  2700. }
  2701. }
  2702. }
  2703. CSEBlocks.clear();
  2704. GatherSeq.clear();
  2705. }
2706. // Groups the instructions into a bundle (which is then a single scheduling entity)
  2707. // and schedules instructions until the bundle gets ready.
  2708. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
  2709. BoUpSLP *SLP, Value *OpValue) {
  2710. if (isa<PHINode>(OpValue))
  2711. return true;
  2712. // Initialize the instruction bundle.
  2713. Instruction *OldScheduleEnd = ScheduleEnd;
  2714. ScheduleData *PrevInBundle = nullptr;
  2715. ScheduleData *Bundle = nullptr;
  2716. bool ReSchedule = false;
  2717. DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
  2718. // Make sure that the scheduling region contains all
  2719. // instructions of the bundle.
  2720. for (Value *V : VL) {
  2721. if (!extendSchedulingRegion(V))
  2722. return false;
  2723. }
  2724. for (Value *V : VL) {
  2725. ScheduleData *BundleMember = getScheduleData(V);
  2726. assert(BundleMember &&
  2727. "no ScheduleData for bundle member (maybe not in same basic block)");
  2728. if (BundleMember->IsScheduled) {
  2729. // A bundle member was scheduled as single instruction before and now
  2730. // needs to be scheduled as part of the bundle. We just get rid of the
  2731. // existing schedule.
  2732. DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
  2733. << " was already scheduled\n");
  2734. ReSchedule = true;
  2735. }
  2736. assert(BundleMember->isSchedulingEntity() &&
  2737. "bundle member already part of other bundle");
  2738. if (PrevInBundle) {
  2739. PrevInBundle->NextInBundle = BundleMember;
  2740. } else {
  2741. Bundle = BundleMember;
  2742. }
  2743. BundleMember->UnscheduledDepsInBundle = 0;
  2744. Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
2745. // Group the instructions into a bundle.
  2746. BundleMember->FirstInBundle = Bundle;
  2747. PrevInBundle = BundleMember;
  2748. }
  2749. if (ScheduleEnd != OldScheduleEnd) {
  2750. // The scheduling region got new instructions at the lower end (or it is a
  2751. // new region for the first bundle). This makes it necessary to
  2752. // recalculate all dependencies.
2753. // This seldom needs to be done a second time after adding the
2754. // initial bundle to the region.
  2755. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  2756. ScheduleData *SD = getScheduleData(I);
  2757. SD->clearDependencies();
  2758. }
  2759. ReSchedule = true;
  2760. }
  2761. if (ReSchedule) {
  2762. resetSchedule();
  2763. initialFillReadyList(ReadyInsts);
  2764. }
  2765. DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
  2766. << BB->getName() << "\n");
  2767. calculateDependencies(Bundle, true, SLP);
  2768. // Now try to schedule the new bundle. As soon as the bundle is "ready" it
  2769. // means that there are no cyclic dependencies and we can schedule it.
2770. // Note that it's important that we don't "schedule" the bundle yet (see
  2771. // cancelScheduling).
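// Illustrative example of a cycle: if one bundle member feeds an instruction
// outside the bundle that in turn feeds another bundle member, the bundle
// would depend on itself; it can then never become ready, and scheduling of
// it is cancelled below.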
  2772. while (!Bundle->isReady() && !ReadyInsts.empty()) {
  2773. ScheduleData *pickedSD = ReadyInsts.back();
  2774. ReadyInsts.pop_back();
  2775. if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
  2776. schedule(pickedSD, ReadyInsts);
  2777. }
  2778. }
  2779. if (!Bundle->isReady()) {
  2780. cancelScheduling(VL, OpValue);
  2781. return false;
  2782. }
  2783. return true;
  2784. }
  2785. void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
  2786. Value *OpValue) {
  2787. if (isa<PHINode>(OpValue))
  2788. return;
  2789. ScheduleData *Bundle = getScheduleData(OpValue);
  2790. DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
  2791. assert(!Bundle->IsScheduled &&
  2792. "Can't cancel bundle which is already scheduled");
  2793. assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
  2794. "tried to unbundle something which is not a bundle");
  2795. // Un-bundle: make single instructions out of the bundle.
  2796. ScheduleData *BundleMember = Bundle;
  2797. while (BundleMember) {
  2798. assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
  2799. BundleMember->FirstInBundle = BundleMember;
  2800. ScheduleData *Next = BundleMember->NextInBundle;
  2801. BundleMember->NextInBundle = nullptr;
  2802. BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
  2803. if (BundleMember->UnscheduledDepsInBundle == 0) {
  2804. ReadyInsts.insert(BundleMember);
  2805. }
  2806. BundleMember = Next;
  2807. }
  2808. }
  2809. bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
  2810. if (getScheduleData(V))
  2811. return true;
  2812. Instruction *I = dyn_cast<Instruction>(V);
  2813. assert(I && "bundle member must be an instruction");
  2814. assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
  2815. if (!ScheduleStart) {
  2816. // It's the first instruction in the new region.
  2817. initScheduleData(I, I->getNextNode(), nullptr, nullptr);
  2818. ScheduleStart = I;
  2819. ScheduleEnd = I->getNextNode();
  2820. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2821. DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
  2822. return true;
  2823. }
  2824. // Search up and down at the same time, because we don't know if the new
  2825. // instruction is above or below the existing scheduling region.
  2826. BasicBlock::reverse_iterator UpIter =
  2827. ++ScheduleStart->getIterator().getReverse();
  2828. BasicBlock::reverse_iterator UpperEnd = BB->rend();
  2829. BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  2830. BasicBlock::iterator LowerEnd = BB->end();
  2831. for (;;) {
  2832. if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
  2833. DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
  2834. return false;
  2835. }
  2836. if (UpIter != UpperEnd) {
  2837. if (&*UpIter == I) {
  2838. initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
  2839. ScheduleStart = I;
  2840. DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
  2841. return true;
  2842. }
  2843. UpIter++;
  2844. }
  2845. if (DownIter != LowerEnd) {
  2846. if (&*DownIter == I) {
  2847. initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
  2848. nullptr);
  2849. ScheduleEnd = I->getNextNode();
  2850. assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
  2851. DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
  2852. return true;
  2853. }
  2854. DownIter++;
  2855. }
  2856. assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
  2857. "instruction not found in block");
  2858. }
  2859. return true;
  2860. }
  2861. void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
  2862. Instruction *ToI,
  2863. ScheduleData *PrevLoadStore,
  2864. ScheduleData *NextLoadStore) {
  2865. ScheduleData *CurrentLoadStore = PrevLoadStore;
  2866. for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
  2867. ScheduleData *SD = ScheduleDataMap[I];
  2868. if (!SD) {
  2869. // Allocate a new ScheduleData for the instruction.
  2870. if (ChunkPos >= ChunkSize) {
  2871. ScheduleDataChunks.push_back(
  2872. llvm::make_unique<ScheduleData[]>(ChunkSize));
  2873. ChunkPos = 0;
  2874. }
  2875. SD = &(ScheduleDataChunks.back()[ChunkPos++]);
  2876. ScheduleDataMap[I] = SD;
  2877. SD->Inst = I;
  2878. }
  2879. assert(!isInSchedulingRegion(SD) &&
  2880. "new ScheduleData already in scheduling region");
  2881. SD->init(SchedulingRegionID);
  2882. if (I->mayReadOrWriteMemory()) {
  2883. // Update the linked list of memory accessing instructions.
  2884. if (CurrentLoadStore) {
  2885. CurrentLoadStore->NextLoadStore = SD;
  2886. } else {
  2887. FirstLoadStoreInRegion = SD;
  2888. }
  2889. CurrentLoadStore = SD;
  2890. }
  2891. }
  2892. if (NextLoadStore) {
  2893. if (CurrentLoadStore)
  2894. CurrentLoadStore->NextLoadStore = NextLoadStore;
  2895. } else {
  2896. LastLoadStoreInRegion = CurrentLoadStore;
  2897. }
  2898. }
  2899. void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
  2900. bool InsertInReadyList,
  2901. BoUpSLP *SLP) {
  2902. assert(SD->isSchedulingEntity());
  2903. SmallVector<ScheduleData *, 10> WorkList;
  2904. WorkList.push_back(SD);
  2905. while (!WorkList.empty()) {
  2906. ScheduleData *SD = WorkList.back();
  2907. WorkList.pop_back();
  2908. ScheduleData *BundleMember = SD;
  2909. while (BundleMember) {
  2910. assert(isInSchedulingRegion(BundleMember));
  2911. if (!BundleMember->hasValidDependencies()) {
  2912. DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
  2913. BundleMember->Dependencies = 0;
  2914. BundleMember->resetUnscheduledDeps();
  2915. // Handle def-use chain dependencies.
  2916. for (User *U : BundleMember->Inst->users()) {
  2917. if (isa<Instruction>(U)) {
  2918. ScheduleData *UseSD = getScheduleData(U);
  2919. if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
  2920. BundleMember->Dependencies++;
  2921. ScheduleData *DestBundle = UseSD->FirstInBundle;
  2922. if (!DestBundle->IsScheduled)
  2923. BundleMember->incrementUnscheduledDeps(1);
  2924. if (!DestBundle->hasValidDependencies())
  2925. WorkList.push_back(DestBundle);
  2926. }
  2927. } else {
2928. // I'm not sure if this can ever happen. But we need to be safe.
2929. // This keeps the instruction/bundle from ever being scheduled and
2930. // eventually disables vectorization.
  2931. BundleMember->Dependencies++;
  2932. BundleMember->incrementUnscheduledDeps(1);
  2933. }
  2934. }
  2935. // Handle the memory dependencies.
  2936. ScheduleData *DepDest = BundleMember->NextLoadStore;
  2937. if (DepDest) {
  2938. Instruction *SrcInst = BundleMember->Inst;
  2939. MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
  2940. bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
  2941. unsigned numAliased = 0;
  2942. unsigned DistToSrc = 1;
  2943. while (DepDest) {
  2944. assert(isInSchedulingRegion(DepDest));
  2945. // We have two limits to reduce the complexity:
  2946. // 1) AliasedCheckLimit: It's a small limit to reduce calls to
  2947. // SLP->isAliased (which is the expensive part in this loop).
  2948. // 2) MaxMemDepDistance: It's for very large blocks and it aborts
  2949. // the whole loop (even if the loop is fast, it's quadratic).
  2950. // It's important for the loop break condition (see below) to
  2951. // check this limit even between two read-only instructions.
  2952. if (DistToSrc >= MaxMemDepDistance ||
  2953. ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
  2954. (numAliased >= AliasedCheckLimit ||
  2955. SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
  2956. // We increment the counter only if the locations are aliased
  2957. // (instead of counting all alias checks). This gives a better
  2958. // balance between reduced runtime and accurate dependencies.
  2959. numAliased++;
  2960. DepDest->MemoryDependencies.push_back(BundleMember);
  2961. BundleMember->Dependencies++;
  2962. ScheduleData *DestBundle = DepDest->FirstInBundle;
  2963. if (!DestBundle->IsScheduled) {
  2964. BundleMember->incrementUnscheduledDeps(1);
  2965. }
  2966. if (!DestBundle->hasValidDependencies()) {
  2967. WorkList.push_back(DestBundle);
  2968. }
  2969. }
  2970. DepDest = DepDest->NextLoadStore;
  2971. // Example, explaining the loop break condition: Let's assume our
  2972. // starting instruction is i0 and MaxMemDepDistance = 3.
  2973. //
  2974. // +--------v--v--v
  2975. // i0,i1,i2,i3,i4,i5,i6,i7,i8
  2976. // +--------^--^--^
  2977. //
  2978. // MaxMemDepDistance let us stop alias-checking at i3 and we add
  2979. // dependencies from i0 to i3,i4,.. (even if they are not aliased).
  2980. // Previously we already added dependencies from i3 to i6,i7,i8
  2981. // (because of MaxMemDepDistance). As we added a dependency from
  2982. // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
  2983. // and we can abort this loop at i6.
  2984. if (DistToSrc >= 2 * MaxMemDepDistance)
  2985. break;
  2986. DistToSrc++;
  2987. }
  2988. }
  2989. }
  2990. BundleMember = BundleMember->NextInBundle;
  2991. }
  2992. if (InsertInReadyList && SD->isReady()) {
  2993. ReadyInsts.push_back(SD);
  2994. DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
  2995. }
  2996. }
  2997. }
  2998. void BoUpSLP::BlockScheduling::resetSchedule() {
  2999. assert(ScheduleStart &&
  3000. "tried to reset schedule on block which has not been scheduled");
  3001. for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
  3002. ScheduleData *SD = getScheduleData(I);
  3003. assert(isInSchedulingRegion(SD));
  3004. SD->IsScheduled = false;
  3005. SD->resetUnscheduledDeps();
  3006. }
  3007. ReadyInsts.clear();
  3008. }
  3009. void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  3010. if (!BS->ScheduleStart)
  3011. return;
  3012. DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
  3013. BS->resetSchedule();
  3014. // For the real scheduling we use a more sophisticated ready-list: it is
  3015. // sorted by the original instruction location. This lets the final schedule
  3016. // be as close as possible to the original instruction order.
  3017. struct ScheduleDataCompare {
  3018. bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
  3019. return SD2->SchedulingPriority < SD1->SchedulingPriority;
  3020. }
  3021. };
  3022. std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
  3023. // Ensure that all dependency data is updated and fill the ready-list with
  3024. // initial instructions.
  3025. int Idx = 0;
  3026. int NumToSchedule = 0;
  3027. for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
  3028. I = I->getNextNode()) {
  3029. ScheduleData *SD = BS->getScheduleData(I);
  3030. assert(
  3031. SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr) &&
  3032. "scheduler and vectorizer have different opinion on what is a bundle");
  3033. SD->FirstInBundle->SchedulingPriority = Idx++;
  3034. if (SD->isSchedulingEntity()) {
  3035. BS->calculateDependencies(SD, false, this);
  3036. NumToSchedule++;
  3037. }
  3038. }
  3039. BS->initialFillReadyList(ReadyInsts);
  3040. Instruction *LastScheduledInst = BS->ScheduleEnd;
  3041. // Do the "real" scheduling.
  3042. while (!ReadyInsts.empty()) {
  3043. ScheduleData *picked = *ReadyInsts.begin();
  3044. ReadyInsts.erase(ReadyInsts.begin());
  3045. // Move the scheduled instruction(s) to their dedicated places, if not
  3046. // there yet.
  3047. ScheduleData *BundleMember = picked;
  3048. while (BundleMember) {
  3049. Instruction *pickedInst = BundleMember->Inst;
  3050. if (LastScheduledInst->getNextNode() != pickedInst) {
  3051. BS->BB->getInstList().remove(pickedInst);
  3052. BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
  3053. pickedInst);
  3054. }
  3055. LastScheduledInst = pickedInst;
  3056. BundleMember = BundleMember->NextInBundle;
  3057. }
  3058. BS->schedule(picked, ReadyInsts);
  3059. NumToSchedule--;
  3060. }
  3061. assert(NumToSchedule == 0 && "could not schedule all instructions");
  3062. // Avoid duplicate scheduling of the block.
  3063. BS->ScheduleStart = nullptr;
  3064. }
  3065. unsigned BoUpSLP::getVectorElementSize(Value *V) {
  3066. // If V is a store, just return the width of the stored value without
  3067. // traversing the expression tree. This is the common case.
  3068. if (auto *Store = dyn_cast<StoreInst>(V))
  3069. return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
  3070. // If V is not a store, we can traverse the expression tree to find loads
  3071. // that feed it. The type of the loaded value may indicate a more suitable
  3072. // width than V's type. We want to base the vector element size on the width
  3073. // of memory operations where possible.
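// Illustrative example: for a hypothetical chain
//   %a = load i8, i8* %p
//   %b = sext i8 %a to i32
//   %c = add i32 %b, 1
// the element size reported for %c is 8 (the loaded width) rather than 32,
// allowing a wider vectorization factor to be chosen.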
  3074. SmallVector<Instruction *, 16> Worklist;
  3075. SmallPtrSet<Instruction *, 16> Visited;
  3076. if (auto *I = dyn_cast<Instruction>(V))
  3077. Worklist.push_back(I);
  3078. // Traverse the expression tree in bottom-up order looking for loads. If we
3079. // encounter an instruction we don't yet handle, we give up.
  3080. auto MaxWidth = 0u;
  3081. auto FoundUnknownInst = false;
  3082. while (!Worklist.empty() && !FoundUnknownInst) {
  3083. auto *I = Worklist.pop_back_val();
  3084. Visited.insert(I);
  3085. // We should only be looking at scalar instructions here. If the current
  3086. // instruction has a vector type, give up.
  3087. auto *Ty = I->getType();
  3088. if (isa<VectorType>(Ty))
  3089. FoundUnknownInst = true;
  3090. // If the current instruction is a load, update MaxWidth to reflect the
  3091. // width of the loaded value.
  3092. else if (isa<LoadInst>(I))
  3093. MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
  3094. // Otherwise, we need to visit the operands of the instruction. We only
  3095. // handle the interesting cases from buildTree here. If an operand is an
  3096. // instruction we haven't yet visited, we add it to the worklist.
  3097. else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
  3098. isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
  3099. for (Use &U : I->operands())
  3100. if (auto *J = dyn_cast<Instruction>(U.get()))
  3101. if (!Visited.count(J))
  3102. Worklist.push_back(J);
  3103. }
  3104. // If we don't yet handle the instruction, give up.
  3105. else
  3106. FoundUnknownInst = true;
  3107. }
  3108. // If we didn't encounter a memory access in the expression tree, or if we
  3109. // gave up for some reason, just return the width of V.
  3110. if (!MaxWidth || FoundUnknownInst)
  3111. return DL->getTypeSizeInBits(V->getType());
  3112. // Otherwise, return the maximum width we found.
  3113. return MaxWidth;
  3114. }
  3115. // Determine if a value V in a vectorizable expression Expr can be demoted to a
  3116. // smaller type with a truncation. We collect the values that will be demoted
  3117. // in ToDemote and additional roots that require investigating in Roots.
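// Illustrative example: in an expression such as
//   %x = sext i8 %a to i32
//   %y = sext i8 %b to i32
//   %s = add i32 %x, %y
// the extensions and the add are all structurally demotable, so (if the bit
// width analysis below allows it) the whole computation can be redone in i8.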
  3118. static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
  3119. SmallVectorImpl<Value *> &ToDemote,
  3120. SmallVectorImpl<Value *> &Roots) {
  3121. // We can always demote constants.
  3122. if (isa<Constant>(V)) {
  3123. ToDemote.push_back(V);
  3124. return true;
  3125. }
  3126. // If the value is not an instruction in the expression with only one use, it
  3127. // cannot be demoted.
  3128. auto *I = dyn_cast<Instruction>(V);
  3129. if (!I || !I->hasOneUse() || !Expr.count(I))
  3130. return false;
  3131. switch (I->getOpcode()) {
  3132. // We can always demote truncations and extensions. Since truncations can
  3133. // seed additional demotion, we save the truncated value.
  3134. case Instruction::Trunc:
3135. Roots.push_back(I->getOperand(0)); // Falls through to the extension cases.
  3136. case Instruction::ZExt:
  3137. case Instruction::SExt:
  3138. break;
  3139. // We can demote certain binary operations if we can demote both of their
  3140. // operands.
  3141. case Instruction::Add:
  3142. case Instruction::Sub:
  3143. case Instruction::Mul:
  3144. case Instruction::And:
  3145. case Instruction::Or:
  3146. case Instruction::Xor:
  3147. if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
  3148. !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
  3149. return false;
  3150. break;
  3151. // We can demote selects if we can demote their true and false values.
  3152. case Instruction::Select: {
  3153. SelectInst *SI = cast<SelectInst>(I);
  3154. if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
  3155. !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
  3156. return false;
  3157. break;
  3158. }
  3159. // We can demote phis if we can demote all their incoming operands. Note that
  3160. // we don't need to worry about cycles since we ensure single use above.
  3161. case Instruction::PHI: {
  3162. PHINode *PN = cast<PHINode>(I);
  3163. for (Value *IncValue : PN->incoming_values())
  3164. if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
  3165. return false;
  3166. break;
  3167. }
  3168. // Otherwise, conservatively give up.
  3169. default:
  3170. return false;
  3171. }
  3172. // Record the value that we can demote.
  3173. ToDemote.push_back(V);
  3174. return true;
  3175. }
  3176. void BoUpSLP::computeMinimumValueSizes() {
  3177. // If there are no external uses, the expression tree must be rooted by a
  3178. // store. We can't demote in-memory values, so there is nothing to do here.
  3179. if (ExternalUses.empty())
  3180. return;
  3181. // We only attempt to truncate integer expressions.
  3182. auto &TreeRoot = VectorizableTree[0].Scalars;
  3183. auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
  3184. if (!TreeRootIT)
  3185. return;
  3186. // If the expression is not rooted by a store, these roots should have
  3187. // external uses. We will rely on InstCombine to rewrite the expression in
  3188. // the narrower type. However, InstCombine only rewrites single-use values.
  3189. // This means that if a tree entry other than a root is used externally, it
  3190. // must have multiple uses and InstCombine will not rewrite it. The code
  3191. // below ensures that only the roots are used externally.
  3192. SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
  3193. for (auto &EU : ExternalUses)
  3194. if (!Expr.erase(EU.Scalar))
  3195. return;
  3196. if (!Expr.empty())
  3197. return;
  3198. // Collect the scalar values of the vectorizable expression. We will use this
  3199. // context to determine which values can be demoted. If we see a truncation,
  3200. // we mark it as seeding another demotion.
  3201. for (auto &Entry : VectorizableTree)
  3202. Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
  3203. // Ensure the roots of the vectorizable tree don't form a cycle. They must
  3204. // have a single external user that is not in the vectorizable tree.
  3205. for (auto *Root : TreeRoot)
  3206. if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
  3207. return;
  3208. // Conservatively determine if we can actually truncate the roots of the
  3209. // expression. Collect the values that can be demoted in ToDemote and
  3210. // additional roots that require investigating in Roots.
  3211. SmallVector<Value *, 32> ToDemote;
  3212. SmallVector<Value *, 4> Roots;
  3213. for (auto *Root : TreeRoot)
  3214. if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
  3215. return;
  3216. // The maximum bit width required to represent all the values that can be
  3217. // demoted without loss of precision. It would be safe to truncate the roots
  3218. // of the expression to this width.
  3219. auto MaxBitWidth = 8u;
  3220. // We first check if all the bits of the roots are demanded. If they're not,
  3221. // we can truncate the roots to this narrower type.
  3222. for (auto *Root : TreeRoot) {
  3223. auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
  3224. MaxBitWidth = std::max<unsigned>(
  3225. Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
  3226. }
  3227. // True if the roots can be zero-extended back to their original type, rather
  3228. // than sign-extended. We know that if the leading bits are not demanded, we
  3229. // can safely zero-extend. So we initialize IsKnownPositive to True.
  3230. bool IsKnownPositive = true;
  3231. // If all the bits of the roots are demanded, we can try a little harder to
  3232. // compute a narrower type. This can happen, for example, if the roots are
  3233. // getelementptr indices. InstCombine promotes these indices to the pointer
  3234. // width. Thus, all their bits are technically demanded even though the
  3235. // address computation might be vectorized in a smaller type.
  3236. //
  3237. // We start by looking at each entry that can be demoted. We compute the
  3238. // maximum bit width required to store the scalar by using ValueTracking to
  3239. // compute the number of high-order bits we can truncate.
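// Worked example (illustrative): an i32 root that ValueTracking proves has 25
// sign bits needs only 32 - 25 = 7 significant bits; after possibly adding
// one bit for an unknown sign and rounding up to a power of two below, it
// would be vectorized with i8 elements.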
  3240. if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
  3241. MaxBitWidth = 8u;
  3242. // Determine if the sign bit of all the roots is known to be zero. If not,
  3243. // IsKnownPositive is set to False.
  3244. IsKnownPositive = all_of(TreeRoot, [&](Value *R) {
  3245. KnownBits Known = computeKnownBits(R, *DL);
  3246. return Known.isNonNegative();
  3247. });
  3248. // Determine the maximum number of bits required to store the scalar
  3249. // values.
  3250. for (auto *Scalar : ToDemote) {
  3251. auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
  3252. auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
  3253. MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
  3254. }
  3255. // If we can't prove that the sign bit is zero, we must add one to the
  3256. // maximum bit width to account for the unknown sign bit. This preserves
  3257. // the existing sign bit so we can safely sign-extend the root back to the
  3258. // original type. Otherwise, if we know the sign bit is zero, we will
  3259. // zero-extend the root instead.
  3260. //
  3261. // FIXME: This is somewhat suboptimal, as there will be cases where adding
  3262. // one to the maximum bit width will yield a larger-than-necessary
  3263. // type. In general, we need to add an extra bit only if we can't
  3264. // prove that the upper bit of the original type is equal to the
  3265. // upper bit of the proposed smaller type. If these two bits are the
  3266. // same (either zero or one) we know that sign-extending from the
  3267. // smaller type will result in the same value. Here, since we can't
  3268. // yet prove this, we are just making the proposed smaller type
  3269. // larger to ensure correctness.
  3270. if (!IsKnownPositive)
  3271. ++MaxBitWidth;
  3272. }
  3273. // Round MaxBitWidth up to the next power-of-two.
  3274. if (!isPowerOf2_64(MaxBitWidth))
  3275. MaxBitWidth = NextPowerOf2(MaxBitWidth);
3276. // If the maximum bit width we compute is less than the width of the roots'
  3277. // type, we can proceed with the narrowing. Otherwise, do nothing.
  3278. if (MaxBitWidth >= TreeRootIT->getBitWidth())
  3279. return;
  3280. // If we can truncate the root, we must collect additional values that might
  3281. // be demoted as a result. That is, those seeded by truncations we will
  3282. // modify.
  3283. while (!Roots.empty())
  3284. collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
3285. // Finally, map the values we can demote to the maximum bit width we computed.
  3286. for (auto *Scalar : ToDemote)
  3287. MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
  3288. }
  3289. namespace {
  3290. /// The SLPVectorizer Pass.
  3291. struct SLPVectorizer : public FunctionPass {
  3292. SLPVectorizerPass Impl;
  3293. /// Pass identification, replacement for typeid
  3294. static char ID;
  3295. explicit SLPVectorizer() : FunctionPass(ID) {
  3296. initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  3297. }
  3298. bool doInitialization(Module &M) override {
  3299. return false;
  3300. }
  3301. bool runOnFunction(Function &F) override {
  3302. if (skipFunction(F))
  3303. return false;
  3304. auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  3305. auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  3306. auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  3307. auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
  3308. auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  3309. auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  3310. auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  3311. auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  3312. auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
  3313. auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
  3314. return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  3315. }
  3316. void getAnalysisUsage(AnalysisUsage &AU) const override {
  3317. FunctionPass::getAnalysisUsage(AU);
  3318. AU.addRequired<AssumptionCacheTracker>();
  3319. AU.addRequired<ScalarEvolutionWrapperPass>();
  3320. AU.addRequired<AAResultsWrapperPass>();
  3321. AU.addRequired<TargetTransformInfoWrapperPass>();
  3322. AU.addRequired<LoopInfoWrapperPass>();
  3323. AU.addRequired<DominatorTreeWrapperPass>();
  3324. AU.addRequired<DemandedBitsWrapperPass>();
  3325. AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  3326. AU.addPreserved<LoopInfoWrapperPass>();
  3327. AU.addPreserved<DominatorTreeWrapperPass>();
  3328. AU.addPreserved<AAResultsWrapperPass>();
  3329. AU.addPreserved<GlobalsAAWrapperPass>();
  3330. AU.setPreservesCFG();
  3331. }
  3332. };
  3333. } // end anonymous namespace
  3334. PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
  3335. auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
  3336. auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
  3337. auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
  3338. auto *AA = &AM.getResult<AAManager>(F);
  3339. auto *LI = &AM.getResult<LoopAnalysis>(F);
  3340. auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
  3341. auto *AC = &AM.getResult<AssumptionAnalysis>(F);
  3342. auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
  3343. auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
  3344. bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
  3345. if (!Changed)
  3346. return PreservedAnalyses::all();
  3347. PreservedAnalyses PA;
  3348. PA.preserveSet<CFGAnalyses>();
  3349. PA.preserve<AAManager>();
  3350. PA.preserve<GlobalsAA>();
  3351. return PA;
  3352. }
  3353. bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
  3354. TargetTransformInfo *TTI_,
  3355. TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
  3356. LoopInfo *LI_, DominatorTree *DT_,
  3357. AssumptionCache *AC_, DemandedBits *DB_,
  3358. OptimizationRemarkEmitter *ORE_) {
  3359. SE = SE_;
  3360. TTI = TTI_;
  3361. TLI = TLI_;
  3362. AA = AA_;
  3363. LI = LI_;
  3364. DT = DT_;
  3365. AC = AC_;
  3366. DB = DB_;
  3367. DL = &F.getParent()->getDataLayout();
  3368. Stores.clear();
  3369. GEPs.clear();
  3370. bool Changed = false;
  3371. // If the target claims to have no vector registers don't attempt
  3372. // vectorization.
  3373. if (!TTI->getNumberOfRegisters(true))
  3374. return false;
  3375. // Don't vectorize when the attribute NoImplicitFloat is used.
  3376. if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  3377. return false;
  3378. DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
  3379. // Use the bottom up slp vectorizer to construct chains that start with
  3380. // store instructions.
  3381. BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
  3382. // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
  3383. // delete instructions.
  3384. // Scan the blocks in the function in post order.
  3385. for (auto BB : post_order(&F.getEntryBlock())) {
  3386. collectSeedInstructions(BB);
  3387. // Vectorize trees that end at stores.
  3388. if (!Stores.empty()) {
  3389. DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
  3390. << " underlying objects.\n");
  3391. Changed |= vectorizeStoreChains(R);
  3392. }
  3393. // Vectorize trees that end at reductions.
  3394. Changed |= vectorizeChainsInBlock(BB, R);
  3395. // Vectorize the index computations of getelementptr instructions. This
  3396. // is primarily intended to catch gather-like idioms ending at
  3397. // non-consecutive loads.
  3398. if (!GEPs.empty()) {
  3399. DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
  3400. << " underlying objects.\n");
  3401. Changed |= vectorizeGEPIndices(BB, R);
  3402. }
  3403. }
  3404. if (Changed) {
  3405. R.optimizeGatherSequence();
  3406. DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  3407. DEBUG(verifyFunction(F));
  3408. }
  3409. return Changed;
  3410. }
/// \brief Check that the Values in the slice of the VL array still exist in
/// the WeakTrackingVH array.
  3413. /// Vectorization of part of the VL array may cause later values in the VL array
  3414. /// to become invalid. We track when this has happened in the WeakTrackingVH
  3415. /// array.
  3416. static bool hasValueBeenRAUWed(ArrayRef<Value *> VL,
  3417. ArrayRef<WeakTrackingVH> VH, unsigned SliceBegin,
  3418. unsigned SliceSize) {
  3419. VL = VL.slice(SliceBegin, SliceSize);
  3420. VH = VH.slice(SliceBegin, SliceSize);
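// A WeakTrackingVH follows RAUW and becomes null on deletion, so any mismatch
// between the current VL slice and the tracked handles means part of the
// slice has been rewritten or erased by earlier vectorization.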
  3421. return !std::equal(VL.begin(), VL.end(), VH.begin());
  3422. }
  3423. bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
  3424. unsigned VecRegSize) {
  3425. unsigned ChainLen = Chain.size();
  3426. DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
  3427. << "\n");
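// Note: the element size and the register size are both in bits, so, for
// example (illustrative), a 128-bit vector register with 32-bit chain
// elements yields VF = 4.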
  3428. unsigned Sz = R.getVectorElementSize(Chain[0]);
  3429. unsigned VF = VecRegSize / Sz;
  3430. if (!isPowerOf2_32(Sz) || VF < 2)
  3431. return false;
  3432. // Keep track of values that were deleted by vectorizing in the loop below.
  3433. SmallVector<WeakTrackingVH, 8> TrackValues(Chain.begin(), Chain.end());
  3434. bool Changed = false;
  3435. // Look for profitable vectorizable trees at all offsets, starting at zero.
  3436. for (unsigned i = 0, e = ChainLen; i < e; ++i) {
  3437. if (i + VF > e)
  3438. break;
  3439. // Check that a previous iteration of this loop did not delete the Value.
  3440. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
  3441. continue;
  3442. DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
  3443. << "\n");
  3444. ArrayRef<Value *> Operands = Chain.slice(i, VF);
  3445. R.buildTree(Operands);
  3446. if (R.isTreeTinyAndNotFullyVectorizable())
  3447. continue;
  3448. R.computeMinimumValueSizes();
  3449. int Cost = R.getTreeCost();
  3450. DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
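// The tree cost is the estimated vector cost minus the scalar cost, so a
// negative value means the vectorized form is expected to be cheaper;
// SLPCostThreshold shifts how aggressive that decision is.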
  3451. if (Cost < -SLPCostThreshold) {
  3452. DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
  3453. using namespace ore;
  3454. R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
  3455. cast<StoreInst>(Chain[i]))
  3456. << "Stores SLP vectorized with cost " << NV("Cost", Cost)
  3457. << " and with tree size "
  3458. << NV("TreeSize", R.getTreeSize()));
  3459. R.vectorizeTree();
  3460. // Move to the next bundle.
  3461. i += VF - 1;
  3462. Changed = true;
  3463. }
  3464. }
  3465. return Changed;
  3466. }
  3467. bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
  3468. BoUpSLP &R) {
  3469. SetVector<StoreInst *> Heads, Tails;
  3470. SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
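// Heads holds stores that start a consecutive pair, Tails holds stores that
// follow another store, and ConsecutiveChain maps each store to the store
// that immediately follows it in memory. Chains are materialized below by
// walking from every head that is not also a tail.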
  3471. // We may run into multiple chains that merge into a single chain. We mark the
  3472. // stores that we vectorized so that we don't visit the same store twice.
  3473. BoUpSLP::ValueSet VectorizedStores;
  3474. bool Changed = false;
  3475. // Do a quadratic search on all of the given stores and find
  3476. // all of the pairs of stores that follow each other.
  3477. SmallVector<unsigned, 16> IndexQueue;
  3478. for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
  3479. IndexQueue.clear();
// If a store has multiple consecutive store candidates, search the Stores
// array according to the sequence: from i+1 to e, then from i-1 to 0.
// This is because pairing with the immediately succeeding or preceding
// candidate usually creates the best chance to find an SLP vectorization
// opportunity.
  3484. unsigned j = 0;
  3485. for (j = i + 1; j < e; ++j)
  3486. IndexQueue.push_back(j);
  3487. for (j = i; j > 0; --j)
  3488. IndexQueue.push_back(j - 1);
  3489. for (auto &k : IndexQueue) {
  3490. if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
  3491. Tails.insert(Stores[k]);
  3492. Heads.insert(Stores[i]);
  3493. ConsecutiveChain[Stores[i]] = Stores[k];
  3494. break;
  3495. }
  3496. }
  3497. }
  3498. // For stores that start but don't end a link in the chain:
  3499. for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
  3500. it != e; ++it) {
  3501. if (Tails.count(*it))
  3502. continue;
  3503. // We found a store instr that starts a chain. Now follow the chain and try
  3504. // to vectorize it.
  3505. BoUpSLP::ValueList Operands;
  3506. StoreInst *I = *it;
  3507. // Collect the chain into a list.
  3508. while (Tails.count(I) || Heads.count(I)) {
  3509. if (VectorizedStores.count(I))
  3510. break;
  3511. Operands.push_back(I);
  3512. // Move to the next value in the chain.
  3513. I = ConsecutiveChain[I];
  3514. }
  3515. // FIXME: Is division-by-2 the correct step? Should we assert that the
  3516. // register size is a power-of-2?
  3517. for (unsigned Size = R.getMaxVecRegSize(); Size >= R.getMinVecRegSize();
  3518. Size /= 2) {
  3519. if (vectorizeStoreChain(Operands, R, Size)) {
  3520. // Mark the vectorized stores so that we don't vectorize them again.
  3521. VectorizedStores.insert(Operands.begin(), Operands.end());
  3522. Changed = true;
  3523. break;
  3524. }
  3525. }
  3526. }
  3527. return Changed;
  3528. }
  3529. void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  3530. // Initialize the collections. We will make a single pass over the block.
  3531. Stores.clear();
  3532. GEPs.clear();
  3533. // Visit the store and getelementptr instructions in BB and organize them in
  3534. // Stores and GEPs according to the underlying objects of their pointer
  3535. // operands.
  3536. for (Instruction &I : *BB) {
  3537. // Ignore store instructions that are volatile or have a pointer operand
  3538. // that doesn't point to a scalar type.
  3539. if (auto *SI = dyn_cast<StoreInst>(&I)) {
  3540. if (!SI->isSimple())
  3541. continue;
  3542. if (!isValidElementType(SI->getValueOperand()->getType()))
  3543. continue;
  3544. Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
  3545. }
  3546. // Ignore getelementptr instructions that have more than one index, a
  3547. // constant index, or a pointer operand that doesn't point to a scalar
  3548. // type.
  3549. else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
  3550. auto Idx = GEP->idx_begin()->get();
  3551. if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
  3552. continue;
  3553. if (!isValidElementType(Idx->getType()))
  3554. continue;
  3555. if (GEP->getType()->isVectorTy())
  3556. continue;
  3557. GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
  3558. }
  3559. }
  3560. }
  3561. bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  3562. if (!A || !B)
  3563. return false;
  3564. Value *VL[] = { A, B };
  3565. return tryToVectorizeList(VL, R, None, true);
  3566. }
  3567. bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
  3568. ArrayRef<Value *> BuildVector,
  3569. bool AllowReorder) {
  3570. if (VL.size() < 2)
  3571. return false;
  3572. DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = " << VL.size()
  3573. << ".\n");
  3574. // Check that all of the parts are scalar instructions of the same type.
  3575. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  3576. if (!I0)
  3577. return false;
  3578. unsigned Opcode0 = I0->getOpcode();
  3579. unsigned Sz = R.getVectorElementSize(I0);
  3580. unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
  3581. unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
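// For example (illustrative numbers): with a 128-bit minimum vector register
// and 32-bit elements, MinVF is 4, and a list of 6 values gives
// MaxVF = PowerOf2Floor(6) = 4.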
  3582. if (MaxVF < 2)
  3583. return false;
  3584. for (Value *V : VL) {
  3585. Type *Ty = V->getType();
  3586. if (!isValidElementType(Ty))
  3587. return false;
  3588. Instruction *Inst = dyn_cast<Instruction>(V);
  3589. if (!Inst || Inst->getOpcode() != Opcode0)
  3590. return false;
  3591. }
  3592. bool Changed = false;
  3593. // Keep track of values that were deleted by vectorizing in the loop below.
  3594. SmallVector<WeakTrackingVH, 8> TrackValues(VL.begin(), VL.end());
  3595. unsigned NextInst = 0, MaxInst = VL.size();
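// Try the widest power-of-two factor first and halve VF on each pass;
// NextInst records how far into VL earlier, wider bundles have already been
// vectorized, so narrower passes only consider the remaining scalars.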
  3596. for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
  3597. VF /= 2) {
// No actual vectorization should happen if the number of parts is the same
// as the provided vectorization factor (i.e. the scalar type is used for the
// vector code during codegen).
  3601. auto *VecTy = VectorType::get(VL[0]->getType(), VF);
  3602. if (TTI->getNumberOfParts(VecTy) == VF)
  3603. continue;
  3604. for (unsigned I = NextInst; I < MaxInst; ++I) {
  3605. unsigned OpsWidth = 0;
  3606. if (I + VF > MaxInst)
  3607. OpsWidth = MaxInst - I;
  3608. else
  3609. OpsWidth = VF;
  3610. if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
  3611. break;
  3612. // Check that a previous iteration of this loop did not delete the Value.
  3613. if (hasValueBeenRAUWed(VL, TrackValues, I, OpsWidth))
  3614. continue;
  3615. DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
  3616. << "\n");
  3617. ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
  3618. ArrayRef<Value *> BuildVectorSlice;
  3619. if (!BuildVector.empty())
  3620. BuildVectorSlice = BuildVector.slice(I, OpsWidth);
  3621. R.buildTree(Ops, BuildVectorSlice);
  3622. // TODO: check if we can allow reordering for more cases.
  3623. if (AllowReorder && R.shouldReorder()) {
  3624. // Conceptually, there is nothing actually preventing us from trying to
  3625. // reorder a larger list. In fact, we do exactly this when vectorizing
  3626. // reductions. However, at this point, we only expect to get here when
  3627. // there are exactly two operations.
  3628. assert(Ops.size() == 2);
  3629. assert(BuildVectorSlice.empty());
  3630. Value *ReorderedOps[] = {Ops[1], Ops[0]};
  3631. R.buildTree(ReorderedOps, None);
  3632. }
  3633. if (R.isTreeTinyAndNotFullyVectorizable())
  3634. continue;
  3635. R.computeMinimumValueSizes();
  3636. int Cost = R.getTreeCost();
  3637. if (Cost < -SLPCostThreshold) {
  3638. DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
  3639. R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
  3640. cast<Instruction>(Ops[0]))
  3641. << "SLP vectorized with cost " << ore::NV("Cost", Cost)
  3642. << " and with tree size "
  3643. << ore::NV("TreeSize", R.getTreeSize()));
  3644. Value *VectorizedRoot = R.vectorizeTree();
  3645. // Reconstruct the build vector by extracting the vectorized root. This
  3646. // way we handle the case where some elements of the vector are
  3647. // undefined.
// (return (insertelement <4 x i32> (insertelement undef (opd0) 0) (opd1) 2))
  3649. if (!BuildVectorSlice.empty()) {
  3650. // The insert point is the last build vector instruction. The
  3651. // vectorized root will precede it. This guarantees that we get an
  3652. // instruction. The vectorized tree could have been constant folded.
  3653. Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
  3654. unsigned VecIdx = 0;
  3655. for (auto &V : BuildVectorSlice) {
  3656. IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
  3657. ++BasicBlock::iterator(InsertAfter));
  3658. Instruction *I = cast<Instruction>(V);
  3659. assert(isa<InsertElementInst>(I) || isa<InsertValueInst>(I));
  3660. Instruction *Extract =
  3661. cast<Instruction>(Builder.CreateExtractElement(
  3662. VectorizedRoot, Builder.getInt32(VecIdx++)));
  3663. I->setOperand(1, Extract);
  3664. I->removeFromParent();
  3665. I->insertAfter(Extract);
  3666. InsertAfter = I;
  3667. }
  3668. }
  3669. // Move to the next bundle.
  3670. I += VF - 1;
  3671. NextInst = I + 1;
  3672. Changed = true;
  3673. }
  3674. }
  3675. }
  3676. return Changed;
  3677. }
  3678. bool SLPVectorizerPass::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  3679. if (!V)
  3680. return false;
  3681. Value *P = V->getParent();
  3682. // Vectorize in current basic block only.
  3683. auto *Op0 = dyn_cast<Instruction>(V->getOperand(0));
  3684. auto *Op1 = dyn_cast<Instruction>(V->getOperand(1));
  3685. if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
  3686. return false;
  3687. // Try to vectorize V.
  3688. if (tryToVectorizePair(Op0, Op1, R))
  3689. return true;
  3690. auto *A = dyn_cast<BinaryOperator>(Op0);
  3691. auto *B = dyn_cast<BinaryOperator>(Op1);
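// If the pair (Op0, Op1) itself was not vectorizable, try to pair one side
// with an operand of the other side instead, e.g. (illustrative) when B is
// itself a binary operator whose operand matches up with A.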
  3692. // Try to skip B.
  3693. if (B && B->hasOneUse()) {
  3694. auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
  3695. auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
  3696. if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
  3697. return true;
  3698. if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
  3699. return true;
  3700. }
  3701. // Try to skip A.
  3702. if (A && A->hasOneUse()) {
  3703. auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
  3704. auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
  3705. if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
  3706. return true;
  3707. if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
  3708. return true;
  3709. }
  3710. return false;
  3711. }
  3712. /// \brief Generate a shuffle mask to be used in a reduction tree.
  3713. ///
  3714. /// \param VecLen The length of the vector to be reduced.
  3715. /// \param NumEltsToRdx The number of elements that should be reduced in the
  3716. /// vector.
  3717. /// \param IsPairwise Whether the reduction is a pairwise or splitting
  3718. /// reduction. A pairwise reduction will generate a mask of
  3719. /// <0,2,...> or <1,3,..> while a splitting reduction will generate
  3720. /// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
  3721. /// \param IsLeft True will generate a mask of even elements, odd otherwise.
  3722. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
  3723. bool IsPairwise, bool IsLeft,
  3724. IRBuilder<> &Builder) {
  3725. assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
  3726. SmallVector<Constant *, 32> ShuffleMask(
  3727. VecLen, UndefValue::get(Builder.getInt32Ty()));
  3728. if (IsPairwise)
  3729. // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
  3730. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3731. ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  3732. else
  3733. // Move the upper half of the vector to the lower half.
  3734. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  3735. ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
  3736. return ConstantVector::get(ShuffleMask);
  3737. }
  3738. namespace {
  3739. /// Model horizontal reductions.
  3740. ///
  3741. /// A horizontal reduction is a tree of reduction operations (currently add and
  3742. /// fadd) that has operations that can be put into a vector as its leaf.
  3743. /// For example, this tree:
  3744. ///
  3745. /// mul mul mul mul
  3746. /// \ / \ /
  3747. /// + +
  3748. /// \ /
  3749. /// +
  3750. /// This tree has "mul" as its reduced values and "+" as its reduction
  3751. /// operations. A reduction might be feeding into a store or a binary operation
  3752. /// feeding a phi.
  3753. /// ...
  3754. /// \ /
  3755. /// +
  3756. /// |
  3757. /// phi +=
  3758. ///
  3759. /// Or:
  3760. /// ...
  3761. /// \ /
  3762. /// +
  3763. /// |
  3764. /// *p =
  3765. ///
  3766. class HorizontalReduction {
  3767. SmallVector<Value *, 16> ReductionOps;
  3768. SmallVector<Value *, 32> ReducedVals;
  3769. // Use map vector to make stable output.
  3770. MapVector<Instruction *, Value *> ExtraArgs;
  3771. BinaryOperator *ReductionRoot = nullptr;
  3772. /// The opcode of the reduction.
  3773. Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
  3774. /// The opcode of the values we perform a reduction on.
  3775. unsigned ReducedValueOpcode = 0;
  3776. /// Should we model this reduction as a pairwise reduction tree or a tree that
  3777. /// splits the vector in halves and adds those halves.
  3778. bool IsPairwiseReduction = false;
  3779. /// Checks if the ParentStackElem.first should be marked as a reduction
  3780. /// operation with an extra argument or as extra argument itself.
  3781. void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
  3782. Value *ExtraArg) {
  3783. if (ExtraArgs.count(ParentStackElem.first)) {
  3784. ExtraArgs[ParentStackElem.first] = nullptr;
  3785. // We ran into something like:
  3786. // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
  3787. // The whole ParentStackElem.first should be considered as an extra value
  3788. // in this case.
  3789. // Do not perform analysis of remaining operands of ParentStackElem.first
  3790. // instruction, this whole instruction is an extra argument.
  3791. ParentStackElem.second = ParentStackElem.first->getNumOperands();
  3792. } else {
  3793. // We ran into something like:
  3794. // ParentStackElem.first += ... + ExtraArg + ...
  3795. ExtraArgs[ParentStackElem.first] = ExtraArg;
  3796. }
  3797. }
  3798. public:
  3799. HorizontalReduction() = default;
  3800. /// \brief Try to find a reduction tree.
  3801. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
assert((!Phi || is_contained(Phi->operands(), B)) &&
       "The phi needs to use the binary operator");
// We could have an initial reduction that is not an add.
// r *= v1 + v2 + v3 + v4
// In such a case, start looking for a tree rooted in the first '+'.
  3807. if (Phi) {
  3808. if (B->getOperand(0) == Phi) {
  3809. Phi = nullptr;
  3810. B = dyn_cast<BinaryOperator>(B->getOperand(1));
  3811. } else if (B->getOperand(1) == Phi) {
  3812. Phi = nullptr;
  3813. B = dyn_cast<BinaryOperator>(B->getOperand(0));
  3814. }
  3815. }
  3816. if (!B)
  3817. return false;
  3818. Type *Ty = B->getType();
  3819. if (!isValidElementType(Ty))
  3820. return false;
  3821. ReductionOpcode = B->getOpcode();
  3822. ReducedValueOpcode = 0;
  3823. ReductionRoot = B;
  3824. // We currently only support adds.
  3825. if ((ReductionOpcode != Instruction::Add &&
  3826. ReductionOpcode != Instruction::FAdd) ||
  3827. !B->isAssociative())
  3828. return false;
  3829. // Post order traverse the reduction tree starting at B. We only handle true
  3830. // trees containing only binary operators or selects.
  3831. SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
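// Each stack entry pairs a tree node with the index of the next operand edge
// to visit; once that index reaches 2, both operands of the binary operator
// have been processed and the node is handled post-order.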
  3832. Stack.push_back(std::make_pair(B, 0));
  3833. while (!Stack.empty()) {
  3834. Instruction *TreeN = Stack.back().first;
  3835. unsigned EdgeToVist = Stack.back().second++;
  3836. bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
// Postorder visit.
  3838. if (EdgeToVist == 2 || IsReducedValue) {
  3839. if (IsReducedValue)
  3840. ReducedVals.push_back(TreeN);
  3841. else {
  3842. auto I = ExtraArgs.find(TreeN);
  3843. if (I != ExtraArgs.end() && !I->second) {
  3844. // Check if TreeN is an extra argument of its parent operation.
  3845. if (Stack.size() <= 1) {
  3846. // TreeN can't be an extra argument as it is a root reduction
  3847. // operation.
  3848. return false;
  3849. }
  3850. // Yes, TreeN is an extra argument, do not add it to a list of
  3851. // reduction operations.
  3852. // Stack[Stack.size() - 2] always points to the parent operation.
  3853. markExtraArg(Stack[Stack.size() - 2], TreeN);
  3854. ExtraArgs.erase(TreeN);
  3855. } else
  3856. ReductionOps.push_back(TreeN);
  3857. }
  3858. // Retract.
  3859. Stack.pop_back();
  3860. continue;
  3861. }
  3862. // Visit left or right.
  3863. Value *NextV = TreeN->getOperand(EdgeToVist);
  3864. if (NextV != Phi) {
  3865. auto *I = dyn_cast<Instruction>(NextV);
  3866. // Continue analysis if the next operand is a reduction operation or
  3867. // (possibly) a reduced value. If the reduced value opcode is not set,
// the first operation encountered that is not the reduction operation is
// taken as the reduced value class.
  3870. if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
  3871. I->getOpcode() == ReductionOpcode)) {
  3872. // Only handle trees in the current basic block.
  3873. if (I->getParent() != B->getParent()) {
  3874. // I is an extra argument for TreeN (its parent operation).
  3875. markExtraArg(Stack.back(), I);
  3876. continue;
  3877. }
  3878. // Each tree node needs to have one user except for the ultimate
  3879. // reduction.
  3880. if (!I->hasOneUse() && I != B) {
  3881. // I is an extra argument for TreeN (its parent operation).
  3882. markExtraArg(Stack.back(), I);
  3883. continue;
  3884. }
  3885. if (I->getOpcode() == ReductionOpcode) {
  3886. // We need to be able to reassociate the reduction operations.
  3887. if (!I->isAssociative()) {
  3888. // I is an extra argument for TreeN (its parent operation).
  3889. markExtraArg(Stack.back(), I);
  3890. continue;
  3891. }
  3892. } else if (ReducedValueOpcode &&
  3893. ReducedValueOpcode != I->getOpcode()) {
  3894. // Make sure that the opcodes of the operations that we are going to
  3895. // reduce match.
  3896. // I is an extra argument for TreeN (its parent operation).
  3897. markExtraArg(Stack.back(), I);
  3898. continue;
  3899. } else if (!ReducedValueOpcode)
  3900. ReducedValueOpcode = I->getOpcode();
  3901. Stack.push_back(std::make_pair(I, 0));
  3902. continue;
  3903. }
  3904. }
  3905. // NextV is an extra argument for TreeN (its parent operation).
  3906. markExtraArg(Stack.back(), NextV);
  3907. }
  3908. return true;
  3909. }
  3910. /// \brief Attempt to vectorize the tree found by
  3911. /// matchAssociativeReduction.
  3912. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
  3913. if (ReducedVals.empty())
  3914. return false;
  3915. // If there is a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
  3917. // vectors and rely on the backend to split them to legal sizes.
  3918. unsigned NumReducedVals = ReducedVals.size();
  3919. if (NumReducedVals < 4)
  3920. return false;
  3921. unsigned ReduxWidth = PowerOf2Floor(NumReducedVals);
  3922. Value *VectorizedTree = nullptr;
  3923. IRBuilder<> Builder(ReductionRoot);
  3924. FastMathFlags Unsafe;
  3925. Unsafe.setUnsafeAlgebra();
  3926. Builder.setFastMathFlags(Unsafe);
  3927. unsigned i = 0;
  3928. BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
  3930. // to use it.
  3931. for (auto &Pair : ExtraArgs)
  3932. ExternallyUsedValues[Pair.second].push_back(Pair.first);
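// Reduce the values in power-of-two chunks: build a tree for ReduxWidth
// values at a time and shrink ReduxWidth to the largest power of two that
// still fits the remaining values; any leftovers are folded in with scalar
// operations after this loop.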
  3933. while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
  3934. auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
  3935. V.buildTree(VL, ExternallyUsedValues, ReductionOps);
  3936. if (V.shouldReorder()) {
  3937. SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
  3938. V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
  3939. }
  3940. if (V.isTreeTinyAndNotFullyVectorizable())
  3941. break;
  3942. V.computeMinimumValueSizes();
  3943. // Estimate cost.
  3944. int Cost =
  3945. V.getTreeCost() + getReductionCost(TTI, ReducedVals[i], ReduxWidth);
  3946. if (Cost >= -SLPCostThreshold)
  3947. break;
  3948. DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
  3949. << ". (HorRdx)\n");
  3950. auto *I0 = cast<Instruction>(VL[0]);
  3951. V.getORE()->emit(
  3952. OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
  3953. << "Vectorized horizontal reduction with cost "
  3954. << ore::NV("Cost", Cost) << " and with tree size "
  3955. << ore::NV("TreeSize", V.getTreeSize()));
  3956. // Vectorize a tree.
  3957. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
  3958. Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
  3959. // Emit a reduction.
  3960. Value *ReducedSubTree =
  3961. emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
  3962. if (VectorizedTree) {
  3963. Builder.SetCurrentDebugLocation(Loc);
  3964. VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
  3965. ReducedSubTree, "bin.rdx");
  3966. propagateIRFlags(VectorizedTree, ReductionOps);
  3967. } else
  3968. VectorizedTree = ReducedSubTree;
  3969. i += ReduxWidth;
  3970. ReduxWidth = PowerOf2Floor(NumReducedVals - i);
  3971. }
  3972. if (VectorizedTree) {
  3973. // Finish the reduction.
  3974. for (; i < NumReducedVals; ++i) {
  3975. auto *I = cast<Instruction>(ReducedVals[i]);
  3976. Builder.SetCurrentDebugLocation(I->getDebugLoc());
  3977. VectorizedTree =
  3978. Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
  3979. propagateIRFlags(VectorizedTree, ReductionOps);
  3980. }
  3981. for (auto &Pair : ExternallyUsedValues) {
  3982. assert(!Pair.second.empty() &&
  3983. "At least one DebugLoc must be inserted");
  3984. // Add each externally used value to the final reduction.
  3985. for (auto *I : Pair.second) {
  3986. Builder.SetCurrentDebugLocation(I->getDebugLoc());
  3987. VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
  3988. Pair.first, "bin.extra");
  3989. propagateIRFlags(VectorizedTree, I);
  3990. }
  3991. }
  3992. // Update users.
  3993. ReductionRoot->replaceAllUsesWith(VectorizedTree);
  3994. }
  3995. return VectorizedTree != nullptr;
  3996. }
  3997. unsigned numReductionValues() const {
  3998. return ReducedVals.size();
  3999. }
  4000. private:
  4001. /// \brief Calculate the cost of a reduction.
  4002. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
  4003. unsigned ReduxWidth) {
  4004. Type *ScalarTy = FirstReducedVal->getType();
  4005. Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
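// Compare the target's cost for a pairwise reduction against a splitting
// (successive halving) reduction and keep the cheaper shape; the scalar
// baseline is ReduxWidth - 1 sequential scalar operations.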
  4006. int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
  4007. int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
  4008. IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
  4009. int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
  4010. int ScalarReduxCost =
  4011. (ReduxWidth - 1) *
  4012. TTI->getArithmeticInstrCost(ReductionOpcode, ScalarTy);
  4013. DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
  4014. << " for reduction that starts with " << *FirstReducedVal
  4015. << " (It is a "
  4016. << (IsPairwiseReduction ? "pairwise" : "splitting")
  4017. << " reduction)\n");
  4018. return VecReduxCost - ScalarReduxCost;
  4019. }
  4020. /// \brief Emit a horizontal reduction of the vectorized value.
  4021. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
  4022. unsigned ReduxWidth, ArrayRef<Value *> RedOps,
  4023. const TargetTransformInfo *TTI) {
  4024. assert(VectorizedValue && "Need to have a vectorized tree node");
  4025. assert(isPowerOf2_32(ReduxWidth) &&
  4026. "We only handle power-of-two reductions for now");
  4027. if (!IsPairwiseReduction)
  4028. return createSimpleTargetReduction(
  4029. Builder, TTI, ReductionOpcode, VectorizedValue,
  4030. TargetTransformInfo::ReductionFlags(), RedOps);
  4031. Value *TmpVec = VectorizedValue;
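// Pairwise shape: reduce by repeatedly halving. For ReduxWidth = 4
// (illustrative), the first step shuffles with masks <0,2,undef,undef> and
// <1,3,undef,undef> and adds the two shuffles; the next step combines lanes
// 0 and 1, leaving the result in lane 0.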
  4032. for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
  4033. Value *LeftMask =
  4034. createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
  4035. Value *RightMask =
  4036. createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
  4037. Value *LeftShuf = Builder.CreateShuffleVector(
  4038. TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
  4039. Value *RightShuf = Builder.CreateShuffleVector(
  4040. TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
  4041. "rdx.shuf.r");
  4042. TmpVec =
  4043. Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
  4044. propagateIRFlags(TmpVec, RedOps);
  4045. }
  4046. // The result is in the first element of the vector.
  4047. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  4048. }
  4049. };
  4050. } // end anonymous namespace
  4051. /// \brief Recognize construction of vectors like
  4052. /// %ra = insertelement <4 x float> undef, float %s0, i32 0
  4053. /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
  4054. /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
  4055. /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
  4056. ///
  4057. /// Returns true if it matches
  4058. ///
  4059. static bool findBuildVector(InsertElementInst *FirstInsertElem,
  4060. SmallVectorImpl<Value *> &BuildVector,
  4061. SmallVectorImpl<Value *> &BuildVectorOpds) {
  4062. if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
  4063. return false;
  4064. InsertElementInst *IE = FirstInsertElem;
  4065. while (true) {
  4066. BuildVector.push_back(IE);
  4067. BuildVectorOpds.push_back(IE->getOperand(1));
  4068. if (IE->use_empty())
  4069. return false;
  4070. InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
  4071. if (!NextUse)
  4072. return true;
  4073. // If this isn't the final use, make sure the next insertelement is the only
// use. It's OK if the final constructed vector is used multiple times.
  4075. if (!IE->hasOneUse())
  4076. return false;
  4077. IE = NextUse;
  4078. }
  4079. return false;
  4080. }
  4081. /// \brief Like findBuildVector, but looks backwards for construction of aggregate.
  4082. ///
  4083. /// \return true if it matches.
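/// For example (illustrative):
///   %a0 = insertvalue [2 x float] undef, float %s0, 0
///   %a1 = insertvalue [2 x float] %a0, float %s1, 1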
  4084. static bool findBuildAggregate(InsertValueInst *IV,
  4085. SmallVectorImpl<Value *> &BuildVector,
  4086. SmallVectorImpl<Value *> &BuildVectorOpds) {
  4087. Value *V;
  4088. do {
  4089. BuildVector.push_back(IV);
  4090. BuildVectorOpds.push_back(IV->getInsertedValueOperand());
  4091. V = IV->getAggregateOperand();
  4092. if (isa<UndefValue>(V))
  4093. break;
  4094. IV = dyn_cast<InsertValueInst>(V);
  4095. if (!IV || !IV->hasOneUse())
  4096. return false;
  4097. } while (true);
  4098. std::reverse(BuildVector.begin(), BuildVector.end());
  4099. std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
  4100. return true;
  4101. }
  4102. static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  4103. return V->getType() < V2->getType();
  4104. }
  4105. /// \brief Try and get a reduction value from a phi node.
  4106. ///
  4107. /// Given a phi node \p P in a block \p ParentBB, consider possible reductions
  4108. /// if they come from either \p ParentBB or a containing loop latch.
  4109. ///
  4110. /// \returns A candidate reduction value if possible, or \code nullptr \endcode
  4111. /// if not possible.
  4112. static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
  4113. BasicBlock *ParentBB, LoopInfo *LI) {
  4114. // There are situations where the reduction value is not dominated by the
  4115. // reduction phi. Vectorizing such cases has been reported to cause
  4116. // miscompiles. See PR25787.
  4117. auto DominatedReduxValue = [&](Value *R) {
  4118. return (
  4119. dyn_cast<Instruction>(R) &&
  4120. DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  4121. };
  4122. Value *Rdx = nullptr;
  4123. // Return the incoming value if it comes from the same BB as the phi node.
  4124. if (P->getIncomingBlock(0) == ParentBB) {
  4125. Rdx = P->getIncomingValue(0);
  4126. } else if (P->getIncomingBlock(1) == ParentBB) {
  4127. Rdx = P->getIncomingValue(1);
  4128. }
  4129. if (Rdx && DominatedReduxValue(Rdx))
  4130. return Rdx;
  4131. // Otherwise, check whether we have a loop latch to look at.
  4132. Loop *BBL = LI->getLoopFor(ParentBB);
  4133. if (!BBL)
  4134. return nullptr;
  4135. BasicBlock *BBLatch = BBL->getLoopLatch();
  4136. if (!BBLatch)
  4137. return nullptr;
  4138. // There is a loop latch, return the incoming value if it comes from
  4139. // that. This reduction pattern occasionally turns up.
  4140. if (P->getIncomingBlock(0) == BBLatch) {
  4141. Rdx = P->getIncomingValue(0);
  4142. } else if (P->getIncomingBlock(1) == BBLatch) {
  4143. Rdx = P->getIncomingValue(1);
  4144. }
  4145. if (Rdx && DominatedReduxValue(Rdx))
  4146. return Rdx;
  4147. return nullptr;
  4148. }
  4149. /// Attempt to reduce a horizontal reduction.
  4150. /// If it is legal to match a horizontal reduction feeding the phi node \a P
  4151. /// with reduction operators \a Root (or one of its operands) in a basic block
  4152. /// \a BB, then check if it can be done. If horizontal reduction is not found
  4153. /// and root instruction is a binary operation, vectorization of the operands is
  4154. /// attempted.
  4155. /// \returns true if a horizontal reduction was matched and reduced or operands
  4156. /// of one of the binary instruction were vectorized.
  4157. /// \returns false if a horizontal reduction was not matched (or not possible)
  4158. /// or no vectorization of any binary operation feeding \a Root instruction was
  4159. /// performed.
  4160. static bool tryToVectorizeHorReductionOrInstOperands(
  4161. PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
  4162. TargetTransformInfo *TTI,
  4163. const function_ref<bool(BinaryOperator *, BoUpSLP &)> Vectorize) {
  4164. if (!ShouldVectorizeHor)
  4165. return false;
  4166. if (!Root)
  4167. return false;
  4168. if (Root->getParent() != BB || isa<PHINode>(Root))
  4169. return false;
// Start analysis from the Root instruction. If a horizontal reduction is
  4171. // found, try to vectorize it. If it is not a horizontal reduction or
  4172. // vectorization is not possible or not effective, and currently analyzed
  4173. // instruction is a binary operation, try to vectorize the operands, using
  4174. // pre-order DFS traversal order. If the operands were not vectorized, repeat
  4175. // the same procedure considering each operand as a possible root of the
  4176. // horizontal reduction.
  4177. // Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
  4179. SmallVector<std::pair<WeakTrackingVH, unsigned>, 8> Stack(1, {Root, 0});
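// Each stack entry pairs a candidate root (held in a WeakTrackingVH so that
// intermediate vectorization can invalidate it) with its depth in the
// traversal; operands are only pushed while the depth stays below
// RecursionMaxDepth.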
  4180. SmallSet<Value *, 8> VisitedInstrs;
  4181. bool Res = false;
  4182. while (!Stack.empty()) {
  4183. Value *V;
  4184. unsigned Level;
  4185. std::tie(V, Level) = Stack.pop_back_val();
  4186. if (!V)
  4187. continue;
  4188. auto *Inst = dyn_cast<Instruction>(V);
  4189. if (!Inst)
  4190. continue;
  4191. if (auto *BI = dyn_cast<BinaryOperator>(Inst)) {
  4192. HorizontalReduction HorRdx;
  4193. if (HorRdx.matchAssociativeReduction(P, BI)) {
  4194. if (HorRdx.tryToReduce(R, TTI)) {
  4195. Res = true;
  4196. // Set P to nullptr to avoid re-analysis of phi node in
  4197. // matchAssociativeReduction function unless this is the root node.
  4198. P = nullptr;
  4199. continue;
  4200. }
  4201. }
  4202. if (P) {
  4203. Inst = dyn_cast<Instruction>(BI->getOperand(0));
  4204. if (Inst == P)
  4205. Inst = dyn_cast<Instruction>(BI->getOperand(1));
  4206. if (!Inst) {
  4207. // Set P to nullptr to avoid re-analysis of phi node in
  4208. // matchAssociativeReduction function unless this is the root node.
  4209. P = nullptr;
  4210. continue;
  4211. }
  4212. }
  4213. }
  4214. // Set P to nullptr to avoid re-analysis of phi node in
  4215. // matchAssociativeReduction function unless this is the root node.
  4216. P = nullptr;
  4217. if (Vectorize(dyn_cast<BinaryOperator>(Inst), R)) {
  4218. Res = true;
  4219. continue;
  4220. }
  4221. // Try to vectorize operands.
  4222. // Continue analysis for the instruction from the same basic block only to
  4223. // save compile time.
  4224. if (++Level < RecursionMaxDepth)
  4225. for (auto *Op : Inst->operand_values())
  4226. if (VisitedInstrs.insert(Op).second)
  4227. if (auto *I = dyn_cast<Instruction>(Op))
  4228. if (!isa<PHINode>(Inst) && I->getParent() == BB)
  4229. Stack.emplace_back(Op, Level);
  4230. }
  4231. return Res;
  4232. }
  4233. bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
  4234. BasicBlock *BB, BoUpSLP &R,
  4235. TargetTransformInfo *TTI) {
  4236. if (!V)
  4237. return false;
  4238. auto *I = dyn_cast<Instruction>(V);
  4239. if (!I)
  4240. return false;
  4241. if (!isa<BinaryOperator>(I))
  4242. P = nullptr;
  4243. // Try to match and vectorize a horizontal reduction.
  4244. return tryToVectorizeHorReductionOrInstOperands(
  4245. P, I, BB, R, TTI, [this](BinaryOperator *BI, BoUpSLP &R) -> bool {
  4246. return tryToVectorize(BI, R);
  4247. });
  4248. }
  4249. bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  4250. bool Changed = false;
  4251. SmallVector<Value *, 4> Incoming;
  4252. SmallSet<Value *, 16> VisitedInstrs;
  4253. bool HaveVectorizedPhiNodes = true;
  4254. while (HaveVectorizedPhiNodes) {
  4255. HaveVectorizedPhiNodes = false;
  4256. // Collect the incoming values from the PHIs.
  4257. Incoming.clear();
  4258. for (Instruction &I : *BB) {
  4259. PHINode *P = dyn_cast<PHINode>(&I);
  4260. if (!P)
  4261. break;
  4262. if (!VisitedInstrs.count(P))
  4263. Incoming.push_back(P);
  4264. }
  4265. // Sort by type.
  4266. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
// Try to vectorize elements based on their type.
  4268. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
  4269. E = Incoming.end();
  4270. IncIt != E;) {
  4271. // Look for the next elements with the same type.
  4272. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
  4273. while (SameTypeIt != E &&
  4274. (*SameTypeIt)->getType() == (*IncIt)->getType()) {
  4275. VisitedInstrs.insert(*SameTypeIt);
  4276. ++SameTypeIt;
  4277. }
  4278. // Try to vectorize them.
  4279. unsigned NumElts = (SameTypeIt - IncIt);
  4280. DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
  4281. // The order in which the phi nodes appear in the program does not matter.
  4282. // So allow tryToVectorizeList to reorder them if it is beneficial. This
  4283. // is done when there are exactly two elements since tryToVectorizeList
  4284. // asserts that there are only two values when AllowReorder is true.
  4285. bool AllowReorder = NumElts == 2;
  4286. if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
  4287. None, AllowReorder)) {
// Success. Start over because instructions might have been changed.
  4289. HaveVectorizedPhiNodes = true;
  4290. Changed = true;
  4291. break;
  4292. }
  4293. // Start over at the next instruction of a different type (or the end).
  4294. IncIt = SameTypeIt;
  4295. }
  4296. }
  4297. VisitedInstrs.clear();
  4298. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
// We may go through BB multiple times so skip the instructions we have
// already checked.
  4300. if (!VisitedInstrs.insert(&*it).second)
  4301. continue;
  4302. if (isa<DbgInfoIntrinsic>(it))
  4303. continue;
  4304. // Try to vectorize reductions that use PHINodes.
  4305. if (PHINode *P = dyn_cast<PHINode>(it)) {
  4306. // Check that the PHI is a reduction PHI.
  4307. if (P->getNumIncomingValues() != 2)
  4308. return Changed;
  4309. // Try to match and vectorize a horizontal reduction.
  4310. if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
  4311. TTI)) {
  4312. Changed = true;
  4313. it = BB->begin();
  4314. e = BB->end();
  4315. continue;
  4316. }
  4317. continue;
  4318. }
  4319. if (ShouldStartVectorizeHorAtStore) {
  4320. if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
  4321. // Try to match and vectorize a horizontal reduction.
  4322. if (vectorizeRootInstruction(nullptr, SI->getValueOperand(), BB, R,
  4323. TTI)) {
  4324. Changed = true;
  4325. it = BB->begin();
  4326. e = BB->end();
  4327. continue;
  4328. }
  4329. }
  4330. }
  4331. // Try to vectorize horizontal reductions feeding into a return.
  4332. if (ReturnInst *RI = dyn_cast<ReturnInst>(it)) {
  4333. if (RI->getNumOperands() != 0) {
  4334. // Try to match and vectorize a horizontal reduction.
  4335. if (vectorizeRootInstruction(nullptr, RI->getOperand(0), BB, R, TTI)) {
  4336. Changed = true;
  4337. it = BB->begin();
  4338. e = BB->end();
  4339. continue;
  4340. }
  4341. }
  4342. }
  4343. // Try to vectorize trees that start at compare instructions.
  4344. if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
  4345. if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
  4346. Changed = true;
  4347. // We would like to start over since some instructions are deleted
// and the iterator may become invalid.
  4349. it = BB->begin();
  4350. e = BB->end();
  4351. continue;
  4352. }
  4353. for (int I = 0; I < 2; ++I) {
  4354. if (vectorizeRootInstruction(nullptr, CI->getOperand(I), BB, R, TTI)) {
  4355. Changed = true;
  4356. // We would like to start over since some instructions are deleted
// and the iterator may become invalid.
  4358. it = BB->begin();
  4359. e = BB->end();
  4360. break;
  4361. }
  4362. }
  4363. continue;
  4364. }
  4365. // Try to vectorize trees that start at insertelement instructions.
  4366. if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
  4367. SmallVector<Value *, 16> BuildVector;
  4368. SmallVector<Value *, 16> BuildVectorOpds;
  4369. if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
  4370. continue;
  4371. // Vectorize starting with the build vector operands ignoring the
  4372. // BuildVector instructions for the purpose of scheduling and user
  4373. // extraction.
  4374. if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
  4375. Changed = true;
  4376. it = BB->begin();
  4377. e = BB->end();
  4378. }
  4379. continue;
  4380. }
  4381. // Try to vectorize trees that start at insertvalue instructions feeding into
  4382. // a store.
  4383. if (StoreInst *SI = dyn_cast<StoreInst>(it)) {
  4384. if (InsertValueInst *LastInsertValue = dyn_cast<InsertValueInst>(SI->getValueOperand())) {
  4385. const DataLayout &DL = BB->getModule()->getDataLayout();
  4386. if (R.canMapToVector(SI->getValueOperand()->getType(), DL)) {
  4387. SmallVector<Value *, 16> BuildVector;
  4388. SmallVector<Value *, 16> BuildVectorOpds;
  4389. if (!findBuildAggregate(LastInsertValue, BuildVector, BuildVectorOpds))
  4390. continue;
  4391. DEBUG(dbgs() << "SLP: store of array mappable to vector: " << *SI << "\n");
  4392. if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false)) {
  4393. Changed = true;
  4394. it = BB->begin();
  4395. e = BB->end();
  4396. }
  4397. continue;
  4398. }
  4399. }
  4400. }
  4401. }
  4402. return Changed;
  4403. }
  4404. bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  4405. auto Changed = false;
  4406. for (auto &Entry : GEPs) {
  4407. // If the getelementptr list has fewer than two elements, there's nothing
  4408. // to do.
  4409. if (Entry.second.size() < 2)
  4410. continue;
  4411. DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
  4412. << Entry.second.size() << ".\n");
  4413. // We process the getelementptr list in chunks of 16 (like we do for
  4414. // stores) to minimize compile-time.
  4415. for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
  4416. auto Len = std::min<unsigned>(BE - BI, 16);
  4417. auto GEPList = makeArrayRef(&Entry.second[BI], Len);
// Initialize a set of candidate getelementptrs. Note that we use a
  4419. // SetVector here to preserve program order. If the index computations
  4420. // are vectorizable and begin with loads, we want to minimize the chance
  4421. // of having to reorder them later.
  4422. SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
// Some of the candidates may have already been vectorized after we
// initially collected them. If so, the WeakTrackingVHs will have
// nullified the values, so remove them from the set of candidates.
  4427. Candidates.remove(nullptr);
  4428. // Remove from the set of candidates all pairs of getelementptrs with
  4429. // constant differences. Such getelementptrs are likely not good
  4430. // candidates for vectorization in a bottom-up phase since one can be
  4431. // computed from the other. We also ensure all candidate getelementptr
  4432. // indices are unique.
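// For example (illustrative), two getelementptrs whose indices are i and
// i + 1 differ by a constant SCEV, so both are dropped, since one index can
// be computed from the other.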
  4433. for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
  4434. auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
  4435. if (!Candidates.count(GEPI))
  4436. continue;
  4437. auto *SCEVI = SE->getSCEV(GEPList[I]);
  4438. for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
  4439. auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
  4440. auto *SCEVJ = SE->getSCEV(GEPList[J]);
  4441. if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
  4442. Candidates.remove(GEPList[I]);
  4443. Candidates.remove(GEPList[J]);
  4444. } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
  4445. Candidates.remove(GEPList[J]);
  4446. }
  4447. }
  4448. }
  4449. // We break out of the above computation as soon as we know there are
  4450. // fewer than two candidates remaining.
  4451. if (Candidates.size() < 2)
  4452. continue;
  4453. // Add the single, non-constant index of each candidate to the bundle. We
  4454. // ensured the indices met these constraints when we originally collected
  4455. // the getelementptrs.
  4456. SmallVector<Value *, 16> Bundle(Candidates.size());
  4457. auto BundleIndex = 0u;
  4458. for (auto *V : Candidates) {
  4459. auto *GEP = cast<GetElementPtrInst>(V);
  4460. auto *GEPIdx = GEP->idx_begin()->get();
  4461. assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
  4462. Bundle[BundleIndex++] = GEPIdx;
  4463. }
  4464. // Try and vectorize the indices. We are currently only interested in
  4465. // gather-like cases of the form:
  4466. //
  4467. // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
  4468. //
  4469. // where the loads of "a", the loads of "b", and the subtractions can be
  4470. // performed in parallel. It's likely that detecting this pattern in a
  4471. // bottom-up phase will be simpler and less costly than building a
  4472. // full-blown top-down phase beginning at the consecutive loads.
  4473. Changed |= tryToVectorizeList(Bundle, R);
  4474. }
  4475. }
  4476. return Changed;
  4477. }
  4478. bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
  4479. bool Changed = false;
  4480. // Attempt to sort and vectorize each of the store-groups.
  4481. for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
  4482. ++it) {
  4483. if (it->second.size() < 2)
  4484. continue;
  4485. DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
  4486. << it->second.size() << ".\n");
  4487. // Process the stores in chunks of 16.
  4488. // TODO: The limit of 16 inhibits greater vectorization factors.
  4489. // For example, AVX2 supports v32i8. Increasing this limit, however,
  4490. // may cause a significant compile-time increase.
  4491. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
  4492. unsigned Len = std::min<unsigned>(CE - CI, 16);
  4493. Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len), R);
  4494. }
  4495. }
  4496. return Changed;
  4497. }
  4498. char SLPVectorizer::ID = 0;
  4499. static const char lv_name[] = "SLP Vectorizer";
  4500. INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
  4501. INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  4502. INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
  4503. INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
  4504. INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
  4505. INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  4506. INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
  4507. INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
  4508. INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
  4509. namespace llvm {
  4510. Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
  4511. }