SLPVectorizer.cpp

  1. //===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. // This pass implements the Bottom Up SLP vectorizer. It detects consecutive
  10. // stores that can be put together into vector-stores. Next, it attempts to
  11. // construct a vectorizable tree using the use-def chains. If a profitable tree
  12. // was found, the SLP vectorizer performs vectorization on the tree.
  13. //
  14. // The pass is inspired by the work described in the paper:
  15. // "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
  16. //
  17. //===----------------------------------------------------------------------===//
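// For illustration only (hypothetical input, not from a test case): four
// consecutive scalar stores such as
//   a[0] = b[0] + c[0];
//   a[1] = b[1] + c[1];
//   a[2] = b[2] + c[2];
//   a[3] = b[3] + c[3];
// form a bundle of stores; the pass follows the use-def chains of the stored
// values to build a tree (adds, then loads), and replaces it with one vector
// add and one vector store when the cost model reports a profit.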
  18. #define SV_NAME "slp-vectorizer"
  19. #define DEBUG_TYPE "SLP"
  20. #include "llvm/Transforms/Vectorize.h"
  21. #include "llvm/ADT/MapVector.h"
  22. #include "llvm/ADT/PostOrderIterator.h"
  23. #include "llvm/ADT/SetVector.h"
  24. #include "llvm/Analysis/AliasAnalysis.h"
  25. #include "llvm/Analysis/LoopInfo.h"
  26. #include "llvm/Analysis/ScalarEvolution.h"
  27. #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  28. #include "llvm/Analysis/TargetTransformInfo.h"
  29. #include "llvm/Analysis/ValueTracking.h"
  30. #include "llvm/IR/DataLayout.h"
  31. #include "llvm/IR/Dominators.h"
  32. #include "llvm/IR/IRBuilder.h"
  33. #include "llvm/IR/Instructions.h"
  34. #include "llvm/IR/IntrinsicInst.h"
  35. #include "llvm/IR/Module.h"
  36. #include "llvm/IR/Type.h"
  37. #include "llvm/IR/Value.h"
  38. #include "llvm/IR/Verifier.h"
  39. #include "llvm/Pass.h"
  40. #include "llvm/Support/CommandLine.h"
  41. #include "llvm/Support/Debug.h"
  42. #include "llvm/Support/raw_ostream.h"
  43. #include <algorithm>
  44. #include <map>
  45. using namespace llvm;
  46. static cl::opt<int>
  47. SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
  48. cl::desc("Only vectorize if you gain more than this "
  49. "number "));
  50. static cl::opt<bool>
  51. ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
  52. cl::desc("Attempt to vectorize horizontal reductions"));
  53. static cl::opt<bool> ShouldStartVectorizeHorAtStore(
  54. "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
  55. cl::desc(
  56. "Attempt to vectorize horizontal reductions feeding into a store"));
  57. namespace {
  58. static const unsigned MinVecRegSize = 128;
  59. static const unsigned RecursionMaxDepth = 12;
  60. /// A helper class for numbering instructions in multiple blocks.
  61. /// Numbers start at zero for each basic block.
  62. struct BlockNumbering {
  63. BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
  64. BlockNumbering() : BB(0), Valid(false) {}
  65. void numberInstructions() {
  66. unsigned Loc = 0;
  67. InstrIdx.clear();
  68. InstrVec.clear();
  69. // Number the instructions in the block.
  70. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
  71. InstrIdx[it] = Loc++;
  72. InstrVec.push_back(it);
  73. assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
  74. }
  75. Valid = true;
  76. }
  77. int getIndex(Instruction *I) {
  78. assert(I->getParent() == BB && "Invalid instruction");
  79. if (!Valid)
  80. numberInstructions();
  81. assert(InstrIdx.count(I) && "Unknown instruction");
  82. return InstrIdx[I];
  83. }
  84. Instruction *getInstruction(unsigned loc) {
  85. if (!Valid)
  86. numberInstructions();
  87. assert(InstrVec.size() > loc && "Invalid Index");
  88. return InstrVec[loc];
  89. }
  90. void forget() { Valid = false; }
  91. private:
  92. /// The block we are numbering.
  93. BasicBlock *BB;
  94. /// Is the block numbered.
  95. bool Valid;
  96. /// Maps instructions to numbers and back.
  97. SmallDenseMap<Instruction *, int> InstrIdx;
  98. /// Maps integers to Instructions.
  99. SmallVector<Instruction *, 32> InstrVec;
  100. };
  101. /// \returns the parent basic block if all of the instructions in \p VL
  102. /// are in the same block or null otherwise.
  103. static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  104. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  105. if (!I0)
  106. return 0;
  107. BasicBlock *BB = I0->getParent();
  108. for (int i = 1, e = VL.size(); i < e; i++) {
  109. Instruction *I = dyn_cast<Instruction>(VL[i]);
  110. if (!I)
  111. return 0;
  112. if (BB != I->getParent())
  113. return 0;
  114. }
  115. return BB;
  116. }
  117. /// \returns True if all of the values in \p VL are constants.
  118. static bool allConstant(ArrayRef<Value *> VL) {
  119. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  120. if (!isa<Constant>(VL[i]))
  121. return false;
  122. return true;
  123. }
  124. /// \returns True if all of the values in \p VL are identical.
  125. static bool isSplat(ArrayRef<Value *> VL) {
  126. for (unsigned i = 1, e = VL.size(); i < e; ++i)
  127. if (VL[i] != VL[0])
  128. return false;
  129. return true;
  130. }
  131. /// \returns The opcode if all of the Instructions in \p VL have the same
  132. /// opcode, or zero.
  133. static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  134. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  135. if (!I0)
  136. return 0;
  137. unsigned Opcode = I0->getOpcode();
  138. for (int i = 1, e = VL.size(); i < e; i++) {
  139. Instruction *I = dyn_cast<Instruction>(VL[i]);
  140. if (!I || Opcode != I->getOpcode())
  141. return 0;
  142. }
  143. return Opcode;
  144. }
  145. /// \returns \p I after propagating metadata from \p VL.
  146. static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  147. Instruction *I0 = cast<Instruction>(VL[0]);
  148. SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  149. I0->getAllMetadataOtherThanDebugLoc(Metadata);
  150. for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
  151. unsigned Kind = Metadata[i].first;
  152. MDNode *MD = Metadata[i].second;
  153. for (int i = 1, e = VL.size(); MD && i != e; i++) {
  154. Instruction *I = cast<Instruction>(VL[i]);
  155. MDNode *IMD = I->getMetadata(Kind);
  156. switch (Kind) {
  157. default:
  158. MD = 0; // Remove unknown metadata
  159. break;
  160. case LLVMContext::MD_tbaa:
  161. MD = MDNode::getMostGenericTBAA(MD, IMD);
  162. break;
  163. case LLVMContext::MD_fpmath:
  164. MD = MDNode::getMostGenericFPMath(MD, IMD);
  165. break;
  166. }
  167. }
  168. I->setMetadata(Kind, MD);
  169. }
  170. return I;
  171. }
  172. /// \returns The type that all of the values in \p VL have or null if there
  173. /// are different types.
  174. static Type* getSameType(ArrayRef<Value *> VL) {
  175. Type *Ty = VL[0]->getType();
  176. for (int i = 1, e = VL.size(); i < e; i++)
  177. if (VL[i]->getType() != Ty)
  178. return 0;
  179. return Ty;
  180. }
  181. /// \returns True if the ExtractElement instructions in VL can be vectorized
  182. /// to use the original vector.
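/// For example (hypothetical IR), the sequence
///   %e0 = extractelement <4 x float> %vec, i32 0
///   %e1 = extractelement <4 x float> %vec, i32 1
///   %e2 = extractelement <4 x float> %vec, i32 2
///   %e3 = extractelement <4 x float> %vec, i32 3
/// reads every lane of %vec in order, so %vec itself can be reused instead of
/// rebuilding the vector.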
  183. static bool CanReuseExtract(ArrayRef<Value *> VL) {
  184. assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  185. // Check if all of the extracts come from the same vector and from the
  186. // correct offset.
  187. Value *VL0 = VL[0];
  188. ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  189. Value *Vec = E0->getOperand(0);
  190. // We have to extract from the same vector type.
  191. unsigned NElts = Vec->getType()->getVectorNumElements();
  192. if (NElts != VL.size())
  193. return false;
  194. // Check that all of the indices extract from the correct offset.
  195. ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  196. if (!CI || CI->getZExtValue())
  197. return false;
  198. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  199. ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
  200. ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
  201. if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
  202. return false;
  203. }
  204. return true;
  205. }
  206. static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
  207. SmallVectorImpl<Value *> &Left,
  208. SmallVectorImpl<Value *> &Right) {
  209. SmallVector<Value *, 16> OrigLeft, OrigRight;
  210. bool AllSameOpcodeLeft = true;
  211. bool AllSameOpcodeRight = true;
  212. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  213. Instruction *I = cast<Instruction>(VL[i]);
  214. Value *V0 = I->getOperand(0);
  215. Value *V1 = I->getOperand(1);
  216. OrigLeft.push_back(V0);
  217. OrigRight.push_back(V1);
  218. Instruction *I0 = dyn_cast<Instruction>(V0);
  219. Instruction *I1 = dyn_cast<Instruction>(V1);
  220. // Check whether all operands on one side have the same opcode. In this case
  221. // we want to preserve the original order and not make things worse by
  222. // reordering.
  223. AllSameOpcodeLeft = I0;
  224. AllSameOpcodeRight = I1;
  225. if (i && AllSameOpcodeLeft) {
  226. if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
  227. if(P0->getOpcode() != I0->getOpcode())
  228. AllSameOpcodeLeft = false;
  229. } else
  230. AllSameOpcodeLeft = false;
  231. }
  232. if (i && AllSameOpcodeRight) {
  233. if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
  234. if(P1->getOpcode() != I1->getOpcode())
  235. AllSameOpcodeRight = false;
  236. } else
  237. AllSameOpcodeRight = false;
  238. }
  239. // Sort two opcodes. In the code below we try to preserve the ability to use
  240. // broadcast of values instead of individual inserts.
  241. // vl1 = load
  242. // vl2 = phi
  243. // vr1 = load
  244. // vr2 = vr1
  245. // = vl1 x vr1
  246. // = vl2 x vr2
  247. // If we just sorted according to opcode we would leave the first line
  248. // intact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
  249. // = vl1 x vr1
  250. // = vr2 x vl2
  251. // Because vr2 and vr1 are from the same load we lose the opportunity of a
  252. // broadcast for the packed right side in the backend: we have [vr1, vl2]
  253. // instead of [vr1, vr2=vr1].
  254. if (I0 && I1) {
  255. if(!i && I0->getOpcode() > I1->getOpcode()) {
  256. Left.push_back(I1);
  257. Right.push_back(I0);
  258. } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
  259. // Try not to destroy a broadcast for no apparent benefit.
  260. Left.push_back(I1);
  261. Right.push_back(I0);
  262. } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] == I0) {
  263. // Try to preserve broadcasts.
  264. Left.push_back(I1);
  265. Right.push_back(I0);
  266. } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
  267. // Try to preserve broadcasts.
  268. Left.push_back(I1);
  269. Right.push_back(I0);
  270. } else {
  271. Left.push_back(I0);
  272. Right.push_back(I1);
  273. }
  274. continue;
  275. }
  276. // One opcode, put the instruction on the right.
  277. if (I0) {
  278. Left.push_back(V1);
  279. Right.push_back(I0);
  280. continue;
  281. }
  282. Left.push_back(V0);
  283. Right.push_back(V1);
  284. }
  285. bool LeftBroadcast = isSplat(Left);
  286. bool RightBroadcast = isSplat(Right);
  287. // Don't reorder if the operands were good to begin with.
  288. if (!(LeftBroadcast || RightBroadcast) &&
  289. (AllSameOpcodeRight || AllSameOpcodeLeft)) {
  290. Left = OrigLeft;
  291. Right = OrigRight;
  292. }
  293. }
  294. /// Bottom Up SLP Vectorizer.
  295. class BoUpSLP {
  296. public:
  297. typedef SmallVector<Value *, 8> ValueList;
  298. typedef SmallVector<Instruction *, 16> InstrList;
  299. typedef SmallPtrSet<Value *, 16> ValueSet;
  300. typedef SmallVector<StoreInst *, 8> StoreList;
  301. BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
  302. TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
  303. DominatorTree *Dt) :
  304. F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
  305. Builder(Se->getContext()) {
  306. // Setup the block numbering utility for all of the blocks in the
  307. // function.
  308. for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
  309. BasicBlock *BB = it;
  310. BlocksNumbers[BB] = BlockNumbering(BB);
  311. }
  312. }
  313. /// \brief Vectorize the tree that starts with the elements in \p VL.
  314. /// Returns the vectorized root.
  315. Value *vectorizeTree();
  316. /// \returns the vectorization cost of the subtree that starts at \p VL.
  317. /// A negative number means that this is profitable.
  318. int getTreeCost();
  319. /// Construct a vectorizable tree that starts at \p Roots and is possibly
  320. /// used by a reduction of \p RdxOps.
  321. void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
  322. /// Clear the internal data structures that are created by 'buildTree'.
  323. void deleteTree() {
  324. RdxOps = 0;
  325. VectorizableTree.clear();
  326. ScalarToTreeEntry.clear();
  327. MustGather.clear();
  328. ExternalUses.clear();
  329. MemBarrierIgnoreList.clear();
  330. }
  331. /// \returns true if the memory operations A and B are consecutive.
  332. bool isConsecutiveAccess(Value *A, Value *B);
  333. /// \brief Perform LICM and CSE on the newly generated gather sequences.
  334. void optimizeGatherSequence();
  335. private:
  336. struct TreeEntry;
  337. /// \returns the cost of the vectorizable entry.
  338. int getEntryCost(TreeEntry *E);
  339. /// This is the recursive part of buildTree.
  340. void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
  341. /// Vectorize a single entry in the tree.
  342. Value *vectorizeTree(TreeEntry *E);
  343. /// Vectorize a single entry in the tree, starting in \p VL.
  344. Value *vectorizeTree(ArrayRef<Value *> VL);
  345. /// \returns the pointer to the vectorized value if \p VL is already
  346. /// vectorized, or NULL. This may happen in cycles.
  347. Value *alreadyVectorized(ArrayRef<Value *> VL) const;
  348. /// \brief Take the pointer operand from the Load/Store instruction.
  349. /// \returns NULL if this is not a valid Load/Store instruction.
  350. static Value *getPointerOperand(Value *I);
  351. /// \brief Take the address space operand from the Load/Store instruction.
  352. /// \returns -1 if this is not a valid Load/Store instruction.
  353. static unsigned getAddressSpaceOperand(Value *I);
  354. /// \returns the scalarization cost for this type. Scalarization in this
  355. /// context means the creation of vectors from a group of scalars.
  356. int getGatherCost(Type *Ty);
  357. /// \returns the scalarization cost for this list of values. Assuming that
  358. /// this subtree gets vectorized, we may need to extract the values from the
  359. /// roots. This method calculates the cost of extracting the values.
  360. int getGatherCost(ArrayRef<Value *> VL);
  361. /// \returns the AA location that is being accessed by the instruction.
  362. AliasAnalysis::Location getLocation(Instruction *I);
  363. /// \brief Checks if it is possible to sink an instruction from
  364. /// \p Src to \p Dst.
  365. /// \returns the pointer to the barrier instruction if we can't sink.
  366. Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
  367. /// \returns the index of the last instruction in the BB from \p VL.
  368. int getLastIndex(ArrayRef<Value *> VL);
  369. /// \returns the Instruction in the bundle \p VL.
  370. Instruction *getLastInstruction(ArrayRef<Value *> VL);
  371. /// \brief Set the Builder insert point to one after the last instruction in
  372. /// the bundle
  373. void setInsertPointAfterBundle(ArrayRef<Value *> VL);
  374. /// \returns a vector from a collection of scalars in \p VL.
  375. Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
  376. /// \returns whether the VectorizableTree is fully vectorizable and will
  377. /// be beneficial even if the tree height is tiny.
  378. bool isFullyVectorizableTinyTree();
  379. struct TreeEntry {
  380. TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
  381. NeedToGather(0) {}
  382. /// \returns true if the scalars in VL are equal to this entry.
  383. bool isSame(ArrayRef<Value *> VL) const {
  384. assert(VL.size() == Scalars.size() && "Invalid size");
  385. return std::equal(VL.begin(), VL.end(), Scalars.begin());
  386. }
  387. /// A vector of scalars.
  388. ValueList Scalars;
  389. /// The Scalars are vectorized into this value. It is initialized to Null.
  390. Value *VectorizedValue;
  391. /// The index in the basic block of the last scalar.
  392. int LastScalarIndex;
  393. /// Do we need to gather this sequence?
  394. bool NeedToGather;
  395. };
  396. /// Create a new VectorizableTree entry.
  397. TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
  398. VectorizableTree.push_back(TreeEntry());
  399. int idx = VectorizableTree.size() - 1;
  400. TreeEntry *Last = &VectorizableTree[idx];
  401. Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
  402. Last->NeedToGather = !Vectorized;
  403. if (Vectorized) {
  404. Last->LastScalarIndex = getLastIndex(VL);
  405. for (int i = 0, e = VL.size(); i != e; ++i) {
  406. assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
  407. ScalarToTreeEntry[VL[i]] = idx;
  408. }
  409. } else {
  410. Last->LastScalarIndex = 0;
  411. MustGather.insert(VL.begin(), VL.end());
  412. }
  413. return Last;
  414. }
  415. /// -- Vectorization State --
  416. /// Holds all of the tree entries.
  417. std::vector<TreeEntry> VectorizableTree;
  418. /// Maps a specific scalar to its tree entry.
  419. SmallDenseMap<Value*, int> ScalarToTreeEntry;
  420. /// A list of scalars that we found that we need to keep as scalars.
  421. ValueSet MustGather;
  422. /// This POD struct describes one external user in the vectorized tree.
  423. struct ExternalUser {
  424. ExternalUser (Value *S, llvm::User *U, int L) :
  425. Scalar(S), User(U), Lane(L){};
  426. // Which scalar in our function.
  427. Value *Scalar;
  429. // Which user uses the scalar.
  429. llvm::User *User;
  430. // Which lane does the scalar belong to.
  431. int Lane;
  432. };
  433. typedef SmallVector<ExternalUser, 16> UserList;
  434. /// A list of values that need to be extracted out of the tree.
  435. /// This list holds pairs of (Internal Scalar : External User).
  436. UserList ExternalUses;
  437. /// A list of instructions to ignore while sinking
  438. /// memory instructions. This map must be reset between runs of getCost.
  439. ValueSet MemBarrierIgnoreList;
  440. /// Holds all of the instructions that we gathered.
  441. SetVector<Instruction *> GatherSeq;
  442. /// A list of blocks that we are going to CSE.
  443. SetVector<BasicBlock *> CSEBlocks;
  444. /// Numbers instructions in different blocks.
  445. DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
  446. /// Reduction operators.
  447. ValueSet *RdxOps;
  448. // Analysis and block reference.
  449. Function *F;
  450. ScalarEvolution *SE;
  451. DataLayout *DL;
  452. TargetTransformInfo *TTI;
  453. AliasAnalysis *AA;
  454. LoopInfo *LI;
  455. DominatorTree *DT;
  456. /// Instruction builder to construct the vectorized tree.
  457. IRBuilder<> Builder;
  458. };
  459. void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
  460. deleteTree();
  461. RdxOps = Rdx;
  462. if (!getSameType(Roots))
  463. return;
  464. buildTree_rec(Roots, 0);
  465. // Collect the values that we need to extract from the tree.
  466. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  467. TreeEntry *Entry = &VectorizableTree[EIdx];
  468. // For each lane:
  469. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  470. Value *Scalar = Entry->Scalars[Lane];
  471. // No need to handle users of gathered values.
  472. if (Entry->NeedToGather)
  473. continue;
  474. for (Value::use_iterator User = Scalar->use_begin(),
  475. UE = Scalar->use_end(); User != UE; ++User) {
  476. DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
  477. // Skip in-tree scalars that become vectors.
  478. if (ScalarToTreeEntry.count(*User)) {
  479. DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
  480. **User << ".\n");
  481. int Idx = ScalarToTreeEntry[*User]; (void) Idx;
  482. assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
  483. continue;
  484. }
  485. Instruction *UserInst = dyn_cast<Instruction>(*User);
  486. if (!UserInst)
  487. continue;
  488. // Ignore uses that are part of the reduction.
  489. if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
  490. continue;
  491. DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
  492. Lane << " from " << *Scalar << ".\n");
  493. ExternalUses.push_back(ExternalUser(Scalar, *User, Lane));
  494. }
  495. }
  496. }
  497. }
  498. void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
  499. bool SameTy = getSameType(VL); (void)SameTy;
  500. assert(SameTy && "Invalid types!");
  501. if (Depth == RecursionMaxDepth) {
  502. DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
  503. newTreeEntry(VL, false);
  504. return;
  505. }
  506. // Don't handle vectors.
  507. if (VL[0]->getType()->isVectorTy()) {
  508. DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
  509. newTreeEntry(VL, false);
  510. return;
  511. }
  512. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  513. if (SI->getValueOperand()->getType()->isVectorTy()) {
  514. DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
  515. newTreeEntry(VL, false);
  516. return;
  517. }
  518. // If all of the operands are identical or constant we have a simple solution.
  519. if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
  520. !getSameOpcode(VL)) {
  521. DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
  522. newTreeEntry(VL, false);
  523. return;
  524. }
  525. // We now know that this is a vector of instructions of the same type from
  526. // the same block.
  527. // Check if this is a duplicate of another entry.
  528. if (ScalarToTreeEntry.count(VL[0])) {
  529. int Idx = ScalarToTreeEntry[VL[0]];
  530. TreeEntry *E = &VectorizableTree[Idx];
  531. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  532. DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
  533. if (E->Scalars[i] != VL[i]) {
  534. DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
  535. newTreeEntry(VL, false);
  536. return;
  537. }
  538. }
  539. DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
  540. return;
  541. }
  542. // Check that none of the instructions in the bundle are already in the tree.
  543. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  544. if (ScalarToTreeEntry.count(VL[i])) {
  545. DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
  546. ") is already in tree.\n");
  547. newTreeEntry(VL, false);
  548. return;
  549. }
  550. }
  551. // If any of the scalars appears in the table OR it is marked as a value that
  552. // needs to stay scalar then we need to gather the scalars.
  553. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  554. if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
  555. DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
  556. newTreeEntry(VL, false);
  557. return;
  558. }
  559. }
  560. // Check that all of the users of the scalars that we want to vectorize are
  561. // schedulable.
  562. Instruction *VL0 = cast<Instruction>(VL[0]);
  563. int MyLastIndex = getLastIndex(VL);
  564. BasicBlock *BB = cast<Instruction>(VL0)->getParent();
  565. for (unsigned i = 0, e = VL.size(); i != e; ++i) {
  566. Instruction *Scalar = cast<Instruction>(VL[i]);
  567. DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n");
  568. for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
  569. U != UE; ++U) {
  570. DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
  571. Instruction *User = dyn_cast<Instruction>(*U);
  572. if (!User) {
  573. DEBUG(dbgs() << "SLP: Gathering due to unknown user. \n");
  574. newTreeEntry(VL, false);
  575. return;
  576. }
  577. // We don't care if the user is in a different basic block.
  578. BasicBlock *UserBlock = User->getParent();
  579. if (UserBlock != BB) {
  580. DEBUG(dbgs() << "SLP: User from a different basic block "
  581. << *User << ". \n");
  582. continue;
  583. }
  584. // If this is a PHINode within this basic block then we can place the
  585. // extract wherever we want.
  586. if (isa<PHINode>(*User)) {
  587. DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n");
  588. continue;
  589. }
  590. // Check if this is a safe in-tree user.
  591. if (ScalarToTreeEntry.count(User)) {
  592. int Idx = ScalarToTreeEntry[User];
  593. int VecLocation = VectorizableTree[Idx].LastScalarIndex;
  594. if (VecLocation <= MyLastIndex) {
  595. DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
  596. newTreeEntry(VL, false);
  597. return;
  598. }
  599. DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" <<
  600. VecLocation << " vector value (" << *Scalar << ") at #"
  601. << MyLastIndex << ".\n");
  602. continue;
  603. }
  604. // This user is part of the reduction.
  605. if (RdxOps && RdxOps->count(User))
  606. continue;
  607. // Make sure that we can schedule this unknown user.
  608. BlockNumbering &BN = BlocksNumbers[BB];
  609. int UserIndex = BN.getIndex(User);
  610. if (UserIndex < MyLastIndex) {
  611. DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
  612. << *User << ". \n");
  613. newTreeEntry(VL, false);
  614. return;
  615. }
  616. }
  617. }
  618. // Check that every instruction appears once in this bundle.
  619. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  620. for (unsigned j = i+1; j < e; ++j)
  621. if (VL[i] == VL[j]) {
  622. DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
  623. newTreeEntry(VL, false);
  624. return;
  625. }
  626. // Check that instructions in this bundle don't reference other instructions.
  627. // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
  628. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  629. for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
  630. U != UE; ++U) {
  631. for (unsigned j = 0; j < e; ++j) {
  632. if (i != j && *U == VL[j]) {
  633. DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n");
  634. newTreeEntry(VL, false);
  635. return;
  636. }
  637. }
  638. }
  639. }
  640. DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
  641. unsigned Opcode = getSameOpcode(VL);
  642. // Check if it is safe to sink the loads or the stores.
  643. if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
  644. Instruction *Last = getLastInstruction(VL);
  645. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  646. if (VL[i] == Last)
  647. continue;
  648. Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
  649. if (Barrier) {
  650. DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
  651. << "\n because of " << *Barrier << ". Gathering.\n");
  652. newTreeEntry(VL, false);
  653. return;
  654. }
  655. }
  656. }
  657. switch (Opcode) {
  658. case Instruction::PHI: {
  659. PHINode *PH = dyn_cast<PHINode>(VL0);
  660. // Check for terminator values (e.g. invoke).
  661. for (unsigned j = 0; j < VL.size(); ++j)
  662. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  663. TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i));
  664. if (Term) {
  665. DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
  666. newTreeEntry(VL, false);
  667. return;
  668. }
  669. }
  670. newTreeEntry(VL, true);
  671. DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
  672. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  673. ValueList Operands;
  674. // Prepare the operand vector.
  675. for (unsigned j = 0; j < VL.size(); ++j)
  676. Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
  677. buildTree_rec(Operands, Depth + 1);
  678. }
  679. return;
  680. }
  681. case Instruction::ExtractElement: {
  682. bool Reuse = CanReuseExtract(VL);
  683. if (Reuse) {
  684. DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
  685. }
  686. newTreeEntry(VL, Reuse);
  687. return;
  688. }
  689. case Instruction::Load: {
  690. // Check if the loads are consecutive or if we need to swizzle them.
  691. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
  692. LoadInst *L = cast<LoadInst>(VL[i]);
  693. if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
  694. newTreeEntry(VL, false);
  695. DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
  696. return;
  697. }
  698. }
  699. newTreeEntry(VL, true);
  700. DEBUG(dbgs() << "SLP: added a vector of loads.\n");
  701. return;
  702. }
  703. case Instruction::ZExt:
  704. case Instruction::SExt:
  705. case Instruction::FPToUI:
  706. case Instruction::FPToSI:
  707. case Instruction::FPExt:
  708. case Instruction::PtrToInt:
  709. case Instruction::IntToPtr:
  710. case Instruction::SIToFP:
  711. case Instruction::UIToFP:
  712. case Instruction::Trunc:
  713. case Instruction::FPTrunc:
  714. case Instruction::BitCast: {
  715. Type *SrcTy = VL0->getOperand(0)->getType();
  716. for (unsigned i = 0; i < VL.size(); ++i) {
  717. Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
  718. if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
  719. newTreeEntry(VL, false);
  720. DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
  721. return;
  722. }
  723. }
  724. newTreeEntry(VL, true);
  725. DEBUG(dbgs() << "SLP: added a vector of casts.\n");
  726. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  727. ValueList Operands;
  728. // Prepare the operand vector.
  729. for (unsigned j = 0; j < VL.size(); ++j)
  730. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  731. buildTree_rec(Operands, Depth+1);
  732. }
  733. return;
  734. }
  735. case Instruction::ICmp:
  736. case Instruction::FCmp: {
  737. // Check that all of the compares have the same predicate.
  738. CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
  739. Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
  740. for (unsigned i = 1, e = VL.size(); i < e; ++i) {
  741. CmpInst *Cmp = cast<CmpInst>(VL[i]);
  742. if (Cmp->getPredicate() != P0 ||
  743. Cmp->getOperand(0)->getType() != ComparedTy) {
  744. newTreeEntry(VL, false);
  745. DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
  746. return;
  747. }
  748. }
  749. newTreeEntry(VL, true);
  750. DEBUG(dbgs() << "SLP: added a vector of compares.\n");
  751. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  752. ValueList Operands;
  753. // Prepare the operand vector.
  754. for (unsigned j = 0; j < VL.size(); ++j)
  755. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  756. buildTree_rec(Operands, Depth+1);
  757. }
  758. return;
  759. }
  760. case Instruction::Select:
  761. case Instruction::Add:
  762. case Instruction::FAdd:
  763. case Instruction::Sub:
  764. case Instruction::FSub:
  765. case Instruction::Mul:
  766. case Instruction::FMul:
  767. case Instruction::UDiv:
  768. case Instruction::SDiv:
  769. case Instruction::FDiv:
  770. case Instruction::URem:
  771. case Instruction::SRem:
  772. case Instruction::FRem:
  773. case Instruction::Shl:
  774. case Instruction::LShr:
  775. case Instruction::AShr:
  776. case Instruction::And:
  777. case Instruction::Or:
  778. case Instruction::Xor: {
  779. newTreeEntry(VL, true);
  780. DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
  781. // Sort operands of the instructions so that each side is more likely to
  782. // have the same opcode.
  783. if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
  784. ValueList Left, Right;
  785. reorderInputsAccordingToOpcode(VL, Left, Right);
  786. buildTree_rec(Left, Depth + 1);
  787. buildTree_rec(Right, Depth + 1);
  788. return;
  789. }
  790. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
  791. ValueList Operands;
  792. // Prepare the operand vector.
  793. for (unsigned j = 0; j < VL.size(); ++j)
  794. Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
  795. buildTree_rec(Operands, Depth+1);
  796. }
  797. return;
  798. }
  799. case Instruction::Store: {
  800. // Check if the stores are consecutive or if we need to swizzle them.
  801. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
  802. if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
  803. newTreeEntry(VL, false);
  804. DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
  805. return;
  806. }
  807. newTreeEntry(VL, true);
  808. DEBUG(dbgs() << "SLP: added a vector of stores.\n");
  809. ValueList Operands;
  810. for (unsigned j = 0; j < VL.size(); ++j)
  811. Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
  812. // We can ignore these values because we are sinking them down.
  813. MemBarrierIgnoreList.insert(VL.begin(), VL.end());
  814. buildTree_rec(Operands, Depth + 1);
  815. return;
  816. }
  817. default:
  818. newTreeEntry(VL, false);
  819. DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
  820. return;
  821. }
  822. }
  823. int BoUpSLP::getEntryCost(TreeEntry *E) {
  824. ArrayRef<Value*> VL = E->Scalars;
  825. Type *ScalarTy = VL[0]->getType();
  826. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  827. ScalarTy = SI->getValueOperand()->getType();
  828. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  829. if (E->NeedToGather) {
  830. if (allConstant(VL))
  831. return 0;
  832. if (isSplat(VL)) {
  833. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
  834. }
  835. return getGatherCost(E->Scalars);
  836. }
  837. assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
  838. "Invalid VL");
  839. Instruction *VL0 = cast<Instruction>(VL[0]);
  840. unsigned Opcode = VL0->getOpcode();
  841. switch (Opcode) {
  842. case Instruction::PHI: {
  843. return 0;
  844. }
  845. case Instruction::ExtractElement: {
  846. if (CanReuseExtract(VL))
  847. return 0;
  848. return getGatherCost(VecTy);
  849. }
  850. case Instruction::ZExt:
  851. case Instruction::SExt:
  852. case Instruction::FPToUI:
  853. case Instruction::FPToSI:
  854. case Instruction::FPExt:
  855. case Instruction::PtrToInt:
  856. case Instruction::IntToPtr:
  857. case Instruction::SIToFP:
  858. case Instruction::UIToFP:
  859. case Instruction::Trunc:
  860. case Instruction::FPTrunc:
  861. case Instruction::BitCast: {
  862. Type *SrcTy = VL0->getOperand(0)->getType();
  863. // Calculate the cost of this instruction.
  864. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
  865. VL0->getType(), SrcTy);
  866. VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
  867. int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
  868. return VecCost - ScalarCost;
  869. }
  870. case Instruction::FCmp:
  871. case Instruction::ICmp:
  872. case Instruction::Select:
  873. case Instruction::Add:
  874. case Instruction::FAdd:
  875. case Instruction::Sub:
  876. case Instruction::FSub:
  877. case Instruction::Mul:
  878. case Instruction::FMul:
  879. case Instruction::UDiv:
  880. case Instruction::SDiv:
  881. case Instruction::FDiv:
  882. case Instruction::URem:
  883. case Instruction::SRem:
  884. case Instruction::FRem:
  885. case Instruction::Shl:
  886. case Instruction::LShr:
  887. case Instruction::AShr:
  888. case Instruction::And:
  889. case Instruction::Or:
  890. case Instruction::Xor: {
  891. // Calculate the cost of this instruction.
  892. int ScalarCost = 0;
  893. int VecCost = 0;
  894. if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
  895. Opcode == Instruction::Select) {
  896. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
  897. ScalarCost = VecTy->getNumElements() *
  898. TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
  899. VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
  900. } else {
  901. // Certain instructions can be cheaper to vectorize if they have a
  902. // constant second vector operand.
  903. TargetTransformInfo::OperandValueKind Op1VK =
  904. TargetTransformInfo::OK_AnyValue;
  905. TargetTransformInfo::OperandValueKind Op2VK =
  906. TargetTransformInfo::OK_UniformConstantValue;
  907. // Check whether all second operands are constant.
  908. for (unsigned i = 0; i < VL.size(); ++i)
  909. if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) {
  910. Op2VK = TargetTransformInfo::OK_AnyValue;
  911. break;
  912. }
  913. ScalarCost =
  914. VecTy->getNumElements() *
  915. TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
  916. VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
  917. }
  918. return VecCost - ScalarCost;
  919. }
  920. case Instruction::Load: {
  921. // Cost of wide load - cost of scalar loads.
  922. int ScalarLdCost = VecTy->getNumElements() *
  923. TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
  924. int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
  925. return VecLdCost - ScalarLdCost;
  926. }
  927. case Instruction::Store: {
  928. // We know that we can merge the stores. Calculate the cost.
  929. int ScalarStCost = VecTy->getNumElements() *
  930. TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
  931. int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
  932. return VecStCost - ScalarStCost;
  933. }
  934. default:
  935. llvm_unreachable("Unknown instruction");
  936. }
  937. }
  938. bool BoUpSLP::isFullyVectorizableTinyTree() {
  939. DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
  940. VectorizableTree.size() << " is fully vectorizable .\n");
  941. // We only handle trees of height 2.
  942. if (VectorizableTree.size() != 2)
  943. return false;
  944. // Gathering cost would be too much for tiny trees.
  945. if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
  946. return false;
  947. return true;
  948. }
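// A sketch of the arithmetic with made-up target costs: for a bundle of four
// i32 adds, getEntryCost() might return 1 (one <4 x i32> add) - 4 (four scalar
// adds) = -3; getTreeCost() sums these per-entry deltas and then adds the cost
// of one extractelement for every scalar that is also used outside the tree.
// A negative total means vectorization is expected to pay off.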
  949. int BoUpSLP::getTreeCost() {
  950. int Cost = 0;
  951. DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
  952. VectorizableTree.size() << ".\n");
  953. // We only vectorize a tiny tree if it is fully vectorizable.
  954. if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
  955. if (!VectorizableTree.size()) {
  956. assert(!ExternalUses.size() && "We should not have any external users");
  957. }
  958. return INT_MAX;
  959. }
  960. unsigned BundleWidth = VectorizableTree[0].Scalars.size();
  961. for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
  962. int C = getEntryCost(&VectorizableTree[i]);
  963. DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
  964. << *VectorizableTree[i].Scalars[0] << " .\n");
  965. Cost += C;
  966. }
  967. SmallSet<Value *, 16> ExtractCostCalculated;
  968. int ExtractCost = 0;
  969. for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
  970. I != E; ++I) {
  971. // We only add extract cost once for the same scalar.
  972. if (!ExtractCostCalculated.insert(I->Scalar))
  973. continue;
  974. VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
  975. ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
  976. I->Lane);
  977. }
  978. DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
  979. return Cost + ExtractCost;
  980. }
  981. int BoUpSLP::getGatherCost(Type *Ty) {
  982. int Cost = 0;
  983. for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
  984. Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
  985. return Cost;
  986. }
  987. int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
  988. // Find the type of the operands in VL.
  989. Type *ScalarTy = VL[0]->getType();
  990. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  991. ScalarTy = SI->getValueOperand()->getType();
  992. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  993. // Find the cost of inserting/extracting values from the vector.
  994. return getGatherCost(VecTy);
  995. }
  996. AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
  997. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  998. return AA->getLocation(SI);
  999. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  1000. return AA->getLocation(LI);
  1001. return AliasAnalysis::Location();
  1002. }
  1003. Value *BoUpSLP::getPointerOperand(Value *I) {
  1004. if (LoadInst *LI = dyn_cast<LoadInst>(I))
  1005. return LI->getPointerOperand();
  1006. if (StoreInst *SI = dyn_cast<StoreInst>(I))
  1007. return SI->getPointerOperand();
  1008. return 0;
  1009. }
  1010. unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
  1011. if (LoadInst *L = dyn_cast<LoadInst>(I))
  1012. return L->getPointerAddressSpace();
  1013. if (StoreInst *S = dyn_cast<StoreInst>(I))
  1014. return S->getPointerAddressSpace();
  1015. return -1;
  1016. }
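// For example (hypothetical IR), an i32 store through %p and an i32 store
// through an inbounds GEP "%p, i64 1" accumulate constant offsets 0 and 4, so
// OffsetDelta == 4 == the store size and the accesses are consecutive. When
// stripping offsets does not reduce both pointers to the same base, SCEV is
// asked whether base-of-B equals base-of-A plus the remaining delta.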
  1017. bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
  1018. Value *PtrA = getPointerOperand(A);
  1019. Value *PtrB = getPointerOperand(B);
  1020. unsigned ASA = getAddressSpaceOperand(A);
  1021. unsigned ASB = getAddressSpaceOperand(B);
  1022. // Check that the address spaces match and that the pointers are valid.
  1023. if (!PtrA || !PtrB || (ASA != ASB))
  1024. return false;
  1025. // Make sure that A and B are different pointers of the same type.
  1026. if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
  1027. return false;
  1028. unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
  1029. Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
  1030. APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
  1031. APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
  1032. PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
  1033. PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
  1034. APInt OffsetDelta = OffsetB - OffsetA;
  1035. // Check if they are based on the same pointer. That makes the offsets
  1036. // sufficient.
  1037. if (PtrA == PtrB)
  1038. return OffsetDelta == Size;
  1039. // Compute the base pointer delta needed to make the final delta between the
  1040. // two accesses equal to the store size.
  1041. APInt BaseDelta = Size - OffsetDelta;
  1042. // Otherwise compute the distance with SCEV between the base pointers.
  1043. const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
  1044. const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
  1045. const SCEV *C = SE->getConstant(BaseDelta);
  1046. const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
  1047. return X == PtrSCEVB;
  1048. }
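// Barrier example (hypothetical): when the first of two scalar loads must be
// sunk down to the second one, any intervening instruction that may write to
// an aliasing (or unknown) location is returned as the barrier, and the caller
// gathers the bundle instead of vectorizing it.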
  1049. Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
  1050. assert(Src->getParent() == Dst->getParent() && "Not the same BB");
  1051. BasicBlock::iterator I = Src, E = Dst;
  1052. /// Scan all of the instructions from SRC to DST and check whether
  1053. /// any of them may alias the source.
  1054. for (++I; I != E; ++I) {
  1055. // Ignore store instructions that are marked as 'ignore'.
  1056. if (MemBarrierIgnoreList.count(I))
  1057. continue;
  1058. if (Src->mayWriteToMemory()) /* Write */ {
  1059. if (!I->mayReadOrWriteMemory())
  1060. continue;
  1061. } else /* Read */ {
  1062. if (!I->mayWriteToMemory())
  1063. continue;
  1064. }
  1065. AliasAnalysis::Location A = getLocation(&*I);
  1066. AliasAnalysis::Location B = getLocation(Src);
  1067. if (!A.Ptr || !B.Ptr || AA->alias(A, B))
  1068. return I;
  1069. }
  1070. return 0;
  1071. }
  1072. int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
  1073. BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
  1074. assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
  1075. BlockNumbering &BN = BlocksNumbers[BB];
  1076. int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
  1077. for (unsigned i = 0, e = VL.size(); i < e; ++i)
  1078. MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
  1079. return MaxIdx;
  1080. }
  1081. Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
  1082. BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
  1083. assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
  1084. BlockNumbering &BN = BlocksNumbers[BB];
  1085. int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
  1086. for (unsigned i = 1, e = VL.size(); i < e; ++i)
  1087. MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
  1088. Instruction *I = BN.getInstruction(MaxIdx);
  1089. assert(I && "bad location");
  1090. return I;
  1091. }
  1092. void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
  1093. Instruction *VL0 = cast<Instruction>(VL[0]);
  1094. Instruction *LastInst = getLastInstruction(VL);
  1095. BasicBlock::iterator NextInst = LastInst;
  1096. ++NextInst;
  1097. Builder.SetInsertPoint(VL0->getParent(), NextInst);
  1098. Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
  1099. }
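// Gather() builds the vector with a chain of inserts; for a 4-wide bundle of
// i32 scalars %a..%d it would emit (hypothetical IR):
//   %g0 = insertelement <4 x i32> undef, i32 %a, i32 0
//   %g1 = insertelement <4 x i32> %g0, i32 %b, i32 1
//   %g2 = insertelement <4 x i32> %g1, i32 %c, i32 2
//   %g3 = insertelement <4 x i32> %g2, i32 %d, i32 3
// Each insert is recorded in GatherSeq and CSEBlocks so that
// optimizeGatherSequence() can later hoist and CSE these sequences.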
  1100. Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
  1101. Value *Vec = UndefValue::get(Ty);
  1102. // Generate the 'InsertElement' instructions.
  1103. for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
  1104. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
  1105. if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
  1106. GatherSeq.insert(Insrt);
  1107. CSEBlocks.insert(Insrt->getParent());
  1108. // Add to our 'need-to-extract' list.
  1109. if (ScalarToTreeEntry.count(VL[i])) {
  1110. int Idx = ScalarToTreeEntry[VL[i]];
  1111. TreeEntry *E = &VectorizableTree[Idx];
  1112. // Find which lane we need to extract.
  1113. int FoundLane = -1;
  1114. for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
  1115. // Is this the lane of the scalar that we are looking for?
  1116. if (E->Scalars[Lane] == VL[i]) {
  1117. FoundLane = Lane;
  1118. break;
  1119. }
  1120. }
  1121. assert(FoundLane >= 0 && "Could not find the correct lane");
  1122. ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
  1123. }
  1124. }
  1125. }
  1126. return Vec;
  1127. }
  1128. Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
  1129. SmallDenseMap<Value*, int>::const_iterator Entry
  1130. = ScalarToTreeEntry.find(VL[0]);
  1131. if (Entry != ScalarToTreeEntry.end()) {
  1132. int Idx = Entry->second;
  1133. const TreeEntry *En = &VectorizableTree[Idx];
  1134. if (En->isSame(VL) && En->VectorizedValue)
  1135. return En->VectorizedValue;
  1136. }
  1137. return 0;
  1138. }
  1139. Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
  1140. if (ScalarToTreeEntry.count(VL[0])) {
  1141. int Idx = ScalarToTreeEntry[VL[0]];
  1142. TreeEntry *E = &VectorizableTree[Idx];
  1143. if (E->isSame(VL))
  1144. return vectorizeTree(E);
  1145. }
  1146. Type *ScalarTy = VL[0]->getType();
  1147. if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
  1148. ScalarTy = SI->getValueOperand()->getType();
  1149. VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
  1150. return Gather(VL, VecTy);
  1151. }
  1152. Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
  1153. IRBuilder<>::InsertPointGuard Guard(Builder);
  1154. if (E->VectorizedValue) {
  1155. DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
  1156. return E->VectorizedValue;
  1157. }
  1158. Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
  1159. Type *ScalarTy = VL0->getType();
  1160. if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
  1161. ScalarTy = SI->getValueOperand()->getType();
  1162. VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
  1163. if (E->NeedToGather) {
  1164. setInsertPointAfterBundle(E->Scalars);
  1165. return Gather(E->Scalars, VecTy);
  1166. }
  1167. unsigned Opcode = VL0->getOpcode();
  1168. assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
  1169. switch (Opcode) {
  1170. case Instruction::PHI: {
  1171. PHINode *PH = dyn_cast<PHINode>(VL0);
  1172. Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
  1173. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1174. PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
  1175. E->VectorizedValue = NewPhi;
  1176. // PHINodes may have multiple entries from the same block. We want to
  1177. // visit every block once.
  1178. SmallSet<BasicBlock*, 4> VisitedBBs;
  1179. for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
  1180. ValueList Operands;
  1181. BasicBlock *IBB = PH->getIncomingBlock(i);
  1182. if (!VisitedBBs.insert(IBB)) {
  1183. NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
  1184. continue;
  1185. }
  1186. // Prepare the operand vector.
  1187. for (unsigned j = 0; j < E->Scalars.size(); ++j)
  1188. Operands.push_back(cast<PHINode>(E->Scalars[j])->
  1189. getIncomingValueForBlock(IBB));
  1190. Builder.SetInsertPoint(IBB->getTerminator());
  1191. Builder.SetCurrentDebugLocation(PH->getDebugLoc());
  1192. Value *Vec = vectorizeTree(Operands);
  1193. NewPhi->addIncoming(Vec, IBB);
  1194. }
  1195. assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
  1196. "Invalid number of incoming values");
  1197. return NewPhi;
  1198. }
  1199. case Instruction::ExtractElement: {
  1200. if (CanReuseExtract(E->Scalars)) {
  1201. Value *V = VL0->getOperand(0);
  1202. E->VectorizedValue = V;
  1203. return V;
  1204. }
  1205. return Gather(E->Scalars, VecTy);
  1206. }
  1207. case Instruction::ZExt:
  1208. case Instruction::SExt:
  1209. case Instruction::FPToUI:
  1210. case Instruction::FPToSI:
  1211. case Instruction::FPExt:
  1212. case Instruction::PtrToInt:
  1213. case Instruction::IntToPtr:
  1214. case Instruction::SIToFP:
  1215. case Instruction::UIToFP:
  1216. case Instruction::Trunc:
  1217. case Instruction::FPTrunc:
  1218. case Instruction::BitCast: {
  1219. ValueList INVL;
  1220. for (int i = 0, e = E->Scalars.size(); i < e; ++i)
  1221. INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
  1222. setInsertPointAfterBundle(E->Scalars);
  1223. Value *InVec = vectorizeTree(INVL);
  1224. if (Value *V = alreadyVectorized(E->Scalars))
  1225. return V;
  1226. CastInst *CI = dyn_cast<CastInst>(VL0);
  1227. Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
  1228. E->VectorizedValue = V;
  1229. return V;
  1230. }
  1231. case Instruction::FCmp:
  1232. case Instruction::ICmp: {
  1233. ValueList LHSV, RHSV;
  1234. for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
  1235. LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
  1236. RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
  1237. }
  1238. setInsertPointAfterBundle(E->Scalars);
  1239. Value *L = vectorizeTree(LHSV);
  1240. Value *R = vectorizeTree(RHSV);
  1241. if (Value *V = alreadyVectorized(E->Scalars))
  1242. return V;
  1243. CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
  1244. Value *V;
  1245. if (Opcode == Instruction::FCmp)
  1246. V = Builder.CreateFCmp(P0, L, R);
  1247. else
  1248. V = Builder.CreateICmp(P0, L, R);
  1249. E->VectorizedValue = V;
  1250. return V;
  1251. }
  1252. case Instruction::Select: {
  1253. ValueList TrueVec, FalseVec, CondVec;
  1254. for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
  1255. CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
  1256. TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
  1257. FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
  1258. }
  1259. setInsertPointAfterBundle(E->Scalars);
  1260. Value *Cond = vectorizeTree(CondVec);
  1261. Value *True = vectorizeTree(TrueVec);
  1262. Value *False = vectorizeTree(FalseVec);
  1263. if (Value *V = alreadyVectorized(E->Scalars))
  1264. return V;
  1265. Value *V = Builder.CreateSelect(Cond, True, False);
  1266. E->VectorizedValue = V;
  1267. return V;
  1268. }
  1269. case Instruction::Add:
  1270. case Instruction::FAdd:
  1271. case Instruction::Sub:
  1272. case Instruction::FSub:
  1273. case Instruction::Mul:
  1274. case Instruction::FMul:
  1275. case Instruction::UDiv:
  1276. case Instruction::SDiv:
  1277. case Instruction::FDiv:
  1278. case Instruction::URem:
  1279. case Instruction::SRem:
  1280. case Instruction::FRem:
  1281. case Instruction::Shl:
  1282. case Instruction::LShr:
  1283. case Instruction::AShr:
  1284. case Instruction::And:
  1285. case Instruction::Or:
  1286. case Instruction::Xor: {
  1287. ValueList LHSVL, RHSVL;
  1288. if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
  1289. reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
  1290. else
  1291. for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
  1292. LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
  1293. RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
  1294. }
  1295. setInsertPointAfterBundle(E->Scalars);
  1296. Value *LHS = vectorizeTree(LHSVL);
  1297. Value *RHS = vectorizeTree(RHSVL);
  1298. if (LHS == RHS && isa<Instruction>(LHS)) {
  1299. assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
  1300. }
  1301. if (Value *V = alreadyVectorized(E->Scalars))
  1302. return V;
  1303. BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
  1304. Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
  1305. E->VectorizedValue = V;
  1306. if (Instruction *I = dyn_cast<Instruction>(V))
  1307. return propagateMetadata(I, E->Scalars);
  1308. return V;
  1309. }
  1310. case Instruction::Load: {
  1311. // Loads are inserted at the head of the tree because we don't want to
  1312. // sink them all the way down past store instructions.
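// Rough shape of the emitted IR (illustrative, not from the original comments),
// using the typed-pointer syntax of this LLVM version:
//   %vp   = bitcast float* %scalar.ptr to <4 x float>*
//   %wide = load <4 x float>* %vp
// with the alignment copied from the first scalar load in the bundle.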
  1313. setInsertPointAfterBundle(E->Scalars);
  1314. LoadInst *LI = cast<LoadInst>(VL0);
  1315. unsigned AS = LI->getPointerAddressSpace();
  1316. Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
  1317. VecTy->getPointerTo(AS));
  1318. unsigned Alignment = LI->getAlignment();
  1319. LI = Builder.CreateLoad(VecPtr);
  1320. LI->setAlignment(Alignment);
  1321. E->VectorizedValue = LI;
  1322. return propagateMetadata(LI, E->Scalars);
  1323. }
  1324. case Instruction::Store: {
  1325. StoreInst *SI = cast<StoreInst>(VL0);
  1326. unsigned Alignment = SI->getAlignment();
  1327. unsigned AS = SI->getPointerAddressSpace();
  1328. ValueList ValueOp;
  1329. for (int i = 0, e = E->Scalars.size(); i < e; ++i)
  1330. ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
  1331. setInsertPointAfterBundle(E->Scalars);
  1332. Value *VecValue = vectorizeTree(ValueOp);
  1333. Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
  1334. VecTy->getPointerTo(AS));
  1335. StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
  1336. S->setAlignment(Alignment);
  1337. E->VectorizedValue = S;
  1338. return propagateMetadata(S, E->Scalars);
  1339. }
  1340. default:
  1341. llvm_unreachable("unknown inst");
  1342. }
  1343. return 0;
  1344. }
  1345. Value *BoUpSLP::vectorizeTree() {
  1346. Builder.SetInsertPoint(F->getEntryBlock().begin());
  1347. vectorizeTree(&VectorizableTree[0]);
1348. DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values.\n");
  1349. // Extract all of the elements with the external uses.
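// Illustrative example (not from the original comments): if scalar %x became
// lane 2 of vector %vec and is still used by an out-of-tree instruction, we
// emit
//   %x.ext = extractelement <4 x i32> %vec, i32 2
// at a suitable point and rewrite that user to take %x.ext instead of %x.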
  1350. for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
  1351. it != e; ++it) {
  1352. Value *Scalar = it->Scalar;
  1353. llvm::User *User = it->User;
1354. // Skip users that we have already RAUW'd. This happens when one instruction
1355. // has multiple uses of the same value.
  1356. if (std::find(Scalar->use_begin(), Scalar->use_end(), User) ==
  1357. Scalar->use_end())
  1358. continue;
  1359. assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
  1360. int Idx = ScalarToTreeEntry[Scalar];
  1361. TreeEntry *E = &VectorizableTree[Idx];
  1362. assert(!E->NeedToGather && "Extracting from a gather list");
  1363. Value *Vec = E->VectorizedValue;
  1364. assert(Vec && "Can't find vectorizable value");
  1365. Value *Lane = Builder.getInt32(it->Lane);
  1366. // Generate extracts for out-of-tree users.
  1367. // Find the insertion point for the extractelement lane.
  1368. if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
  1369. Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
  1370. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  1371. CSEBlocks.insert(PN->getParent());
  1372. User->replaceUsesOfWith(Scalar, Ex);
  1373. } else if (isa<Instruction>(Vec)){
  1374. if (PHINode *PH = dyn_cast<PHINode>(User)) {
  1375. for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
  1376. if (PH->getIncomingValue(i) == Scalar) {
  1377. Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
  1378. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  1379. CSEBlocks.insert(PH->getIncomingBlock(i));
  1380. PH->setOperand(i, Ex);
  1381. }
  1382. }
  1383. } else {
  1384. Builder.SetInsertPoint(cast<Instruction>(User));
  1385. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  1386. CSEBlocks.insert(cast<Instruction>(User)->getParent());
  1387. User->replaceUsesOfWith(Scalar, Ex);
  1388. }
  1389. } else {
  1390. Builder.SetInsertPoint(F->getEntryBlock().begin());
  1391. Value *Ex = Builder.CreateExtractElement(Vec, Lane);
  1392. CSEBlocks.insert(&F->getEntryBlock());
  1393. User->replaceUsesOfWith(Scalar, Ex);
  1394. }
  1395. DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
  1396. }
  1397. // For each vectorized value:
  1398. for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
  1399. TreeEntry *Entry = &VectorizableTree[EIdx];
  1400. // For each lane:
  1401. for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
  1402. Value *Scalar = Entry->Scalars[Lane];
  1403. // No need to handle users of gathered values.
  1404. if (Entry->NeedToGather)
  1405. continue;
  1406. assert(Entry->VectorizedValue && "Can't find vectorizable value");
  1407. Type *Ty = Scalar->getType();
  1408. if (!Ty->isVoidTy()) {
  1409. for (Value::use_iterator User = Scalar->use_begin(),
  1410. UE = Scalar->use_end(); User != UE; ++User) {
  1411. DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
  1412. assert((ScalarToTreeEntry.count(*User) ||
  1413. // It is legal to replace the reduction users by undef.
  1414. (RdxOps && RdxOps->count(*User))) &&
  1415. "Replacing out-of-tree value with undef");
  1416. }
  1417. Value *Undef = UndefValue::get(Ty);
  1418. Scalar->replaceAllUsesWith(Undef);
  1419. }
  1420. DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
  1421. cast<Instruction>(Scalar)->eraseFromParent();
  1422. }
  1423. }
  1424. for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
  1425. BlocksNumbers[it].forget();
  1426. }
  1427. Builder.ClearInsertionPoint();
  1428. return VectorizableTree[0].VectorizedValue;
  1429. }
  1430. class DTCmp {
  1431. const DominatorTree *DT;
  1432. public:
  1433. DTCmp(const DominatorTree *DT) : DT(DT) {}
  1434. bool operator()(const BasicBlock *A, const BasicBlock *B) const {
  1435. return DT->properlyDominates(A, B);
  1436. }
  1437. };
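// Summary note (added for exposition, not in the original source): the routine
// below runs two phases. First it hoists loop-invariant insertelement chains
// produced by gathers into the enclosing loop's preheader; then it walks the
// blocks containing gather code in dominance order and merges identical
// insertelement/extractelement instructions that are dominated by an earlier
// copy.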
  1438. void BoUpSLP::optimizeGatherSequence() {
  1439. DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
1440. << " gather sequence instructions.\n");
  1441. // LICM InsertElementInst sequences.
  1442. for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
  1443. e = GatherSeq.end(); it != e; ++it) {
  1444. InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
  1445. if (!Insert)
  1446. continue;
  1447. // Check if this block is inside a loop.
  1448. Loop *L = LI->getLoopFor(Insert->getParent());
  1449. if (!L)
  1450. continue;
  1451. // Check if it has a preheader.
  1452. BasicBlock *PreHeader = L->getLoopPreheader();
  1453. if (!PreHeader)
  1454. continue;
1455. // If the vector or the element that we insert into it is an
1456. // instruction defined inside this loop, then we can't hoist
1457. // the insertelement out of the loop.
  1458. Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
  1459. Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
  1460. if (CurrVec && L->contains(CurrVec))
  1461. continue;
  1462. if (NewElem && L->contains(NewElem))
  1463. continue;
  1464. // We can hoist this instruction. Move it to the pre-header.
  1465. Insert->moveBefore(PreHeader->getTerminator());
  1466. }
  1467. // Sort blocks by domination. This ensures we visit a block after all blocks
  1468. // dominating it are visited.
  1469. SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
  1470. std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT));
  1471. // Perform O(N^2) search over the gather sequences and merge identical
  1472. // instructions. TODO: We can further optimize this scan if we split the
  1473. // instructions into different buckets based on the insert lane.
  1474. SmallVector<Instruction *, 16> Visited;
  1475. for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
  1476. E = CSEWorkList.end();
  1477. I != E; ++I) {
  1478. assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) &&
  1479. "Worklist not sorted properly!");
  1480. BasicBlock *BB = *I;
  1481. // For all instructions in blocks containing gather sequences:
  1482. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
  1483. Instruction *In = it++;
  1484. if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
  1485. continue;
  1486. // Check if we can replace this instruction with any of the
  1487. // visited instructions.
  1488. for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
  1489. ve = Visited.end();
  1490. v != ve; ++v) {
  1491. if (In->isIdenticalTo(*v) &&
  1492. DT->dominates((*v)->getParent(), In->getParent())) {
  1493. In->replaceAllUsesWith(*v);
  1494. In->eraseFromParent();
  1495. In = 0;
  1496. break;
  1497. }
  1498. }
  1499. if (In) {
  1500. assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
  1501. Visited.push_back(In);
  1502. }
  1503. }
  1504. }
  1505. CSEBlocks.clear();
  1506. GatherSeq.clear();
  1507. }
  1508. /// The SLPVectorizer Pass.
  1509. struct SLPVectorizer : public FunctionPass {
  1510. typedef SmallVector<StoreInst *, 8> StoreList;
  1511. typedef MapVector<Value *, StoreList> StoreListMap;
  1512. /// Pass identification, replacement for typeid
  1513. static char ID;
  1514. explicit SLPVectorizer() : FunctionPass(ID) {
  1515. initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  1516. }
  1517. ScalarEvolution *SE;
  1518. DataLayout *DL;
  1519. TargetTransformInfo *TTI;
  1520. AliasAnalysis *AA;
  1521. LoopInfo *LI;
  1522. DominatorTree *DT;
  1523. virtual bool runOnFunction(Function &F) {
  1524. SE = &getAnalysis<ScalarEvolution>();
  1525. DL = getAnalysisIfAvailable<DataLayout>();
  1526. TTI = &getAnalysis<TargetTransformInfo>();
  1527. AA = &getAnalysis<AliasAnalysis>();
  1528. LI = &getAnalysis<LoopInfo>();
  1529. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  1530. StoreRefs.clear();
  1531. bool Changed = false;
  1532. // If the target claims to have no vector registers don't attempt
  1533. // vectorization.
  1534. if (!TTI->getNumberOfRegisters(true))
  1535. return false;
  1536. // Must have DataLayout. We can't require it because some tests run w/o
  1537. // triple.
  1538. if (!DL)
  1539. return false;
  1540. // Don't vectorize when the attribute NoImplicitFloat is used.
  1541. if (F.hasFnAttribute(Attribute::NoImplicitFloat))
  1542. return false;
  1543. DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
1544. // Use the bottom-up SLP vectorizer to construct chains that start with
1545. // the store instructions.
  1546. BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
  1547. // Scan the blocks in the function in post order.
  1548. for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
  1549. e = po_end(&F.getEntryBlock()); it != e; ++it) {
  1550. BasicBlock *BB = *it;
  1551. // Vectorize trees that end at stores.
  1552. if (unsigned count = collectStores(BB, R)) {
  1553. (void)count;
  1554. DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
  1555. Changed |= vectorizeStoreChains(R);
  1556. }
  1557. // Vectorize trees that end at reductions.
  1558. Changed |= vectorizeChainsInBlock(BB, R);
  1559. }
  1560. if (Changed) {
  1561. R.optimizeGatherSequence();
  1562. DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
  1563. DEBUG(verifyFunction(F));
  1564. }
  1565. return Changed;
  1566. }
  1567. virtual void getAnalysisUsage(AnalysisUsage &AU) const {
  1568. FunctionPass::getAnalysisUsage(AU);
  1569. AU.addRequired<ScalarEvolution>();
  1570. AU.addRequired<AliasAnalysis>();
  1571. AU.addRequired<TargetTransformInfo>();
  1572. AU.addRequired<LoopInfo>();
  1573. AU.addRequired<DominatorTreeWrapperPass>();
  1574. AU.addPreserved<LoopInfo>();
  1575. AU.addPreserved<DominatorTreeWrapperPass>();
  1576. AU.setPreservesCFG();
  1577. }
  1578. private:
  1579. /// \brief Collect memory references and sort them according to their base
1580. /// object. We group the stores by their base object to reduce the cost of the
1581. /// quadratic search over the stores. TODO: We can further reduce this cost
  1582. /// if we flush the chain creation every time we run into a memory barrier.
  1583. unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
  1584. /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
  1585. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
  1586. /// \brief Try to vectorize a list of operands.
  1587. /// \returns true if a value was vectorized.
  1588. bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
1589. /// \brief Try to vectorize a chain that may start at the operands of \p V.
  1590. bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
  1591. /// \brief Vectorize the stores that were collected in StoreRefs.
  1592. bool vectorizeStoreChains(BoUpSLP &R);
  1593. /// \brief Scan the basic block and look for patterns that are likely to start
  1594. /// a vectorization chain.
  1595. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
  1596. bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
  1597. BoUpSLP &R);
  1598. bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
  1599. BoUpSLP &R);
  1600. private:
  1601. StoreListMap StoreRefs;
  1602. };
1603. /// \brief Check that the values in the slice of the VL array still exist in
1604. /// the WeakVH array.
  1605. /// Vectorization of part of the VL array may cause later values in the VL array
  1606. /// to become invalid. We track when this has happened in the WeakVH array.
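/// (Illustrative note, not in the original source: WeakVH handles follow RAUW
/// and are nulled when their value is deleted, so VH[i] != VL[i] flags a slice
/// whose scalar was already rewritten or erased by an earlier vectorization
/// step.)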
  1607. static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
  1608. SmallVectorImpl<WeakVH> &VH,
  1609. unsigned SliceBegin,
  1610. unsigned SliceSize) {
  1611. for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
  1612. if (VH[i] != VL[i])
  1613. return true;
  1614. return false;
  1615. }
  1616. bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
  1617. int CostThreshold, BoUpSLP &R) {
  1618. unsigned ChainLen = Chain.size();
  1619. DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
  1620. << "\n");
  1621. Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
  1622. unsigned Sz = DL->getTypeSizeInBits(StoreTy);
  1623. unsigned VF = MinVecRegSize / Sz;
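// Worked example (illustrative): assuming MinVecRegSize is 128 and the stored
// values are 32 bits wide, Sz = 32 and VF = 4, so the loop below tries to
// vectorize groups of four consecutive stores.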
  1624. if (!isPowerOf2_32(Sz) || VF < 2)
  1625. return false;
1626. // Keep track of values that were deleted by vectorizing in the loop below.
  1627. SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
  1628. bool Changed = false;
  1629. // Look for profitable vectorizable trees at all offsets, starting at zero.
  1630. for (unsigned i = 0, e = ChainLen; i < e; ++i) {
  1631. if (i + VF > e)
  1632. break;
  1633. // Check that a previous iteration of this loop did not delete the Value.
  1634. if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
  1635. continue;
  1636. DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
  1637. << "\n");
  1638. ArrayRef<Value *> Operands = Chain.slice(i, VF);
  1639. R.buildTree(Operands);
  1640. int Cost = R.getTreeCost();
  1641. DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
  1642. if (Cost < CostThreshold) {
  1643. DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
  1644. R.vectorizeTree();
  1645. // Move to the next bundle.
  1646. i += VF - 1;
  1647. Changed = true;
  1648. }
  1649. }
  1650. return Changed;
  1651. }
  1652. bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
  1653. int costThreshold, BoUpSLP &R) {
  1654. SetVector<Value *> Heads, Tails;
  1655. SmallDenseMap<Value *, Value *> ConsecutiveChain;
  1656. // We may run into multiple chains that merge into a single chain. We mark the
  1657. // stores that we vectorized so that we don't visit the same store twice.
  1658. BoUpSLP::ValueSet VectorizedStores;
  1659. bool Changed = false;
  1660. // Do a quadratic search on all of the given stores and find
  1661. // all of the pairs of stores that follow each other.
  1662. for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
  1663. for (unsigned j = 0; j < e; ++j) {
  1664. if (i == j)
  1665. continue;
  1666. if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
  1667. Tails.insert(Stores[j]);
  1668. Heads.insert(Stores[i]);
  1669. ConsecutiveChain[Stores[i]] = Stores[j];
  1670. }
  1671. }
  1672. }
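// Illustrative example (not from the original comments): for three stores
// s0, s1, s2 to consecutive addresses, the loop above records Heads = {s0, s1},
// Tails = {s1, s2} and ConsecutiveChain = {s0 -> s1, s1 -> s2}; the walk below
// then starts from s0 (a head that is not also a tail) and rebuilds the whole
// chain s0, s1, s2 before handing it to vectorizeStoreChain.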
1673. // For stores that start a chain but are not themselves a continuation of one:
  1674. for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
  1675. it != e; ++it) {
  1676. if (Tails.count(*it))
  1677. continue;
1678. // We found a store instruction that starts a chain. Now follow the chain
1679. // and try to vectorize it.
  1680. BoUpSLP::ValueList Operands;
  1681. Value *I = *it;
  1682. // Collect the chain into a list.
  1683. while (Tails.count(I) || Heads.count(I)) {
  1684. if (VectorizedStores.count(I))
  1685. break;
  1686. Operands.push_back(I);
  1687. // Move to the next value in the chain.
  1688. I = ConsecutiveChain[I];
  1689. }
  1690. bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
  1691. // Mark the vectorized stores so that we don't vectorize them again.
  1692. if (Vectorized)
  1693. VectorizedStores.insert(Operands.begin(), Operands.end());
  1694. Changed |= Vectorized;
  1695. }
  1696. return Changed;
  1697. }
  1698. unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
  1699. unsigned count = 0;
  1700. StoreRefs.clear();
  1701. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
  1702. StoreInst *SI = dyn_cast<StoreInst>(it);
  1703. if (!SI)
  1704. continue;
  1705. // Don't touch volatile stores.
  1706. if (!SI->isSimple())
  1707. continue;
  1708. // Check that the pointer points to scalars.
  1709. Type *Ty = SI->getValueOperand()->getType();
  1710. if (Ty->isAggregateType() || Ty->isVectorTy())
  1711. return 0;
  1712. // Find the base pointer.
  1713. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
  1714. // Save the store locations.
  1715. StoreRefs[Ptr].push_back(SI);
  1716. count++;
  1717. }
  1718. return count;
  1719. }
  1720. bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
  1721. if (!A || !B)
  1722. return false;
  1723. Value *VL[] = { A, B };
  1724. return tryToVectorizeList(VL, R);
  1725. }
  1726. bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
  1727. if (VL.size() < 2)
  1728. return false;
  1729. DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
  1730. // Check that all of the parts are scalar instructions of the same type.
  1731. Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  1732. if (!I0)
  1733. return false;
  1734. unsigned Opcode0 = I0->getOpcode();
  1735. Type *Ty0 = I0->getType();
  1736. unsigned Sz = DL->getTypeSizeInBits(Ty0);
  1737. unsigned VF = MinVecRegSize / Sz;
  1738. for (int i = 0, e = VL.size(); i < e; ++i) {
  1739. Type *Ty = VL[i]->getType();
  1740. if (Ty->isAggregateType() || Ty->isVectorTy())
  1741. return false;
  1742. Instruction *Inst = dyn_cast<Instruction>(VL[i]);
  1743. if (!Inst || Inst->getOpcode() != Opcode0)
  1744. return false;
  1745. }
  1746. bool Changed = false;
1747. // Keep track of values that were deleted by vectorizing in the loop below.
  1748. SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
  1749. for (unsigned i = 0, e = VL.size(); i < e; ++i) {
  1750. unsigned OpsWidth = 0;
  1751. if (i + VF > e)
  1752. OpsWidth = e - i;
  1753. else
  1754. OpsWidth = VF;
  1755. if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
  1756. break;
  1757. // Check that a previous iteration of this loop did not delete the Value.
  1758. if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
  1759. continue;
  1760. DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
  1761. << "\n");
  1762. ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
  1763. R.buildTree(Ops);
  1764. int Cost = R.getTreeCost();
  1765. if (Cost < -SLPCostThreshold) {
1766. DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
  1767. R.vectorizeTree();
  1768. // Move to the next bundle.
  1769. i += VF - 1;
  1770. Changed = true;
  1771. }
  1772. }
  1773. return Changed;
  1774. }
  1775. bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
  1776. if (!V)
  1777. return false;
  1778. // Try to vectorize V.
  1779. if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
  1780. return true;
  1781. BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
  1782. BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
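// Illustrative example (not from the original comments): for V = A + B where
// B = x * y has a single use, the code below also tries the pairs (A, x) and
// (A, y); if one of them vectorizes, B is moved up to just before V.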
  1783. // Try to skip B.
  1784. if (B && B->hasOneUse()) {
  1785. BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
  1786. BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
  1787. if (tryToVectorizePair(A, B0, R)) {
  1788. B->moveBefore(V);
  1789. return true;
  1790. }
  1791. if (tryToVectorizePair(A, B1, R)) {
  1792. B->moveBefore(V);
  1793. return true;
  1794. }
  1795. }
  1796. // Try to skip A.
  1797. if (A && A->hasOneUse()) {
  1798. BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
  1799. BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
  1800. if (tryToVectorizePair(A0, B, R)) {
  1801. A->moveBefore(V);
  1802. return true;
  1803. }
  1804. if (tryToVectorizePair(A1, B, R)) {
  1805. A->moveBefore(V);
  1806. return true;
  1807. }
  1808. }
1809. return false;
  1810. }
  1811. /// \brief Generate a shuffle mask to be used in a reduction tree.
  1812. ///
  1813. /// \param VecLen The length of the vector to be reduced.
  1814. /// \param NumEltsToRdx The number of elements that should be reduced in the
  1815. /// vector.
  1816. /// \param IsPairwise Whether the reduction is a pairwise or splitting
  1817. /// reduction. A pairwise reduction will generate a mask of
  1818. /// <0,2,...> or <1,3,..> while a splitting reduction will generate
1819. /// <2,3, undef,undef> for a vector of 4 and NumEltsToRdx = 2.
  1820. /// \param IsLeft True will generate a mask of even elements, odd otherwise.
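/// For example (illustrative): with VecLen = 4 and NumEltsToRdx = 2, a pairwise
/// "left" mask is <0, 2, undef, undef>, the matching "right" mask is
/// <1, 3, undef, undef>, and a splitting mask is <2, 3, undef, undef>.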
  1821. static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
  1822. bool IsPairwise, bool IsLeft,
  1823. IRBuilder<> &Builder) {
  1824. assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
  1825. SmallVector<Constant *, 32> ShuffleMask(
  1826. VecLen, UndefValue::get(Builder.getInt32Ty()));
  1827. if (IsPairwise)
  1828. // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
  1829. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  1830. ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
  1831. else
  1832. // Move the upper half of the vector to the lower half.
  1833. for (unsigned i = 0; i != NumEltsToRdx; ++i)
  1834. ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
  1835. return ConstantVector::get(ShuffleMask);
  1836. }
  1837. /// Model horizontal reductions.
  1838. ///
1839. /// A horizontal reduction is a tree of reduction operations (currently add and
1840. /// fadd) whose leaves are operations that can be put into a vector.
  1841. /// For example, this tree:
  1842. ///
  1843. /// mul mul mul mul
  1844. /// \ / \ /
  1845. /// + +
  1846. /// \ /
  1847. /// +
  1848. /// This tree has "mul" as its reduced values and "+" as its reduction
  1849. /// operations. A reduction might be feeding into a store or a binary operation
  1850. /// feeding a phi.
  1851. /// ...
  1852. /// \ /
  1853. /// +
  1854. /// |
  1855. /// phi +=
  1856. ///
  1857. /// Or:
  1858. /// ...
  1859. /// \ /
  1860. /// +
  1861. /// |
  1862. /// *p =
  1863. ///
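/// A typical source pattern (illustrative example, not from the original file):
///   float r = 0;
///   for (int i = 0; i < n; i += 4)
///     r += a[i]*b[i] + a[i+1]*b[i+1] + a[i+2]*b[i+2] + a[i+3]*b[i+3];
/// Here the multiplies are the reduced values and the chain of '+' operations
/// feeding the accumulator phi is the reduction.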
  1864. class HorizontalReduction {
  1865. SmallPtrSet<Value *, 16> ReductionOps;
  1866. SmallVector<Value *, 32> ReducedVals;
  1867. BinaryOperator *ReductionRoot;
  1868. PHINode *ReductionPHI;
  1869. /// The opcode of the reduction.
  1870. unsigned ReductionOpcode;
  1871. /// The opcode of the values we perform a reduction on.
  1872. unsigned ReducedValueOpcode;
  1873. /// The width of one full horizontal reduction operation.
  1874. unsigned ReduxWidth;
  1875. /// Should we model this reduction as a pairwise reduction tree or a tree that
  1876. /// splits the vector in halves and adds those halves.
  1877. bool IsPairwiseReduction;
  1878. public:
  1879. HorizontalReduction()
  1880. : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
  1881. ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
  1882. /// \brief Try to find a reduction tree.
  1883. bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
  1884. DataLayout *DL) {
  1885. assert((!Phi ||
  1886. std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
1887. "This phi needs to use the binary operator");
1888. // We could have an initial reduction that is not an add.
  1889. // r *= v1 + v2 + v3 + v4
  1890. // In such a case start looking for a tree rooted in the first '+'.
  1891. if (Phi) {
  1892. if (B->getOperand(0) == Phi) {
  1893. Phi = 0;
  1894. B = dyn_cast<BinaryOperator>(B->getOperand(1));
  1895. } else if (B->getOperand(1) == Phi) {
  1896. Phi = 0;
  1897. B = dyn_cast<BinaryOperator>(B->getOperand(0));
  1898. }
  1899. }
  1900. if (!B)
  1901. return false;
  1902. Type *Ty = B->getType();
  1903. if (Ty->isVectorTy())
  1904. return false;
  1905. ReductionOpcode = B->getOpcode();
  1906. ReducedValueOpcode = 0;
  1907. ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
  1908. ReductionRoot = B;
  1909. ReductionPHI = Phi;
  1910. if (ReduxWidth < 4)
  1911. return false;
  1912. // We currently only support adds.
  1913. if (ReductionOpcode != Instruction::Add &&
  1914. ReductionOpcode != Instruction::FAdd)
  1915. return false;
1916. // Post-order traverse the reduction tree starting at B. We only handle proper
1917. // trees that contain only binary operators.
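// (Illustrative note, not in the original comments: each Stack entry pairs a
// node with the index of the next operand edge to follow; a node is popped
// only after both of its operand edges have been visited, or immediately if it
// is a reduced value, which yields the post-order walk described above.)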
  1918. SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
  1919. Stack.push_back(std::make_pair(B, 0));
  1920. while (!Stack.empty()) {
  1921. BinaryOperator *TreeN = Stack.back().first;
  1922. unsigned EdgeToVist = Stack.back().second++;
  1923. bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
  1924. // Only handle trees in the current basic block.
  1925. if (TreeN->getParent() != B->getParent())
  1926. return false;
  1927. // Each tree node needs to have one user except for the ultimate
  1928. // reduction.
  1929. if (!TreeN->hasOneUse() && TreeN != B)
  1930. return false;
1931. // Postorder visit.
  1932. if (EdgeToVist == 2 || IsReducedValue) {
  1933. if (IsReducedValue) {
  1934. // Make sure that the opcodes of the operations that we are going to
  1935. // reduce match.
  1936. if (!ReducedValueOpcode)
  1937. ReducedValueOpcode = TreeN->getOpcode();
  1938. else if (ReducedValueOpcode != TreeN->getOpcode())
  1939. return false;
  1940. ReducedVals.push_back(TreeN);
  1941. } else {
  1942. // We need to be able to reassociate the adds.
  1943. if (!TreeN->isAssociative())
  1944. return false;
  1945. ReductionOps.insert(TreeN);
  1946. }
  1947. // Retract.
  1948. Stack.pop_back();
  1949. continue;
  1950. }
  1951. // Visit left or right.
  1952. Value *NextV = TreeN->getOperand(EdgeToVist);
  1953. BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
  1954. if (Next)
  1955. Stack.push_back(std::make_pair(Next, 0));
  1956. else if (NextV != Phi)
  1957. return false;
  1958. }
  1959. return true;
  1960. }
  1961. /// \brief Attempt to vectorize the tree found by
  1962. /// matchAssociativeReduction.
  1963. bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
  1964. if (ReducedVals.empty())
  1965. return false;
  1966. unsigned NumReducedVals = ReducedVals.size();
  1967. if (NumReducedVals < ReduxWidth)
  1968. return false;
  1969. Value *VectorizedTree = 0;
  1970. IRBuilder<> Builder(ReductionRoot);
  1971. FastMathFlags Unsafe;
  1972. Unsafe.setUnsafeAlgebra();
  1973. Builder.SetFastMathFlags(Unsafe);
  1974. unsigned i = 0;
  1975. for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
  1976. ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
  1977. V.buildTree(ValsToReduce, &ReductionOps);
  1978. // Estimate cost.
  1979. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
  1980. if (Cost >= -SLPCostThreshold)
  1981. break;
  1982. DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
  1983. << ". (HorRdx)\n");
  1984. // Vectorize a tree.
  1985. DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
  1986. Value *VectorizedRoot = V.vectorizeTree();
  1987. // Emit a reduction.
  1988. Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
  1989. if (VectorizedTree) {
  1990. Builder.SetCurrentDebugLocation(Loc);
  1991. VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
  1992. ReducedSubTree, "bin.rdx");
  1993. } else
  1994. VectorizedTree = ReducedSubTree;
  1995. }
  1996. if (VectorizedTree) {
  1997. // Finish the reduction.
  1998. for (; i < NumReducedVals; ++i) {
  1999. Builder.SetCurrentDebugLocation(
  2000. cast<Instruction>(ReducedVals[i])->getDebugLoc());
  2001. VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
  2002. ReducedVals[i]);
  2003. }
  2004. // Update users.
  2005. if (ReductionPHI) {
  2006. assert(ReductionRoot != NULL && "Need a reduction operation");
  2007. ReductionRoot->setOperand(0, VectorizedTree);
  2008. ReductionRoot->setOperand(1, ReductionPHI);
  2009. } else
  2010. ReductionRoot->replaceAllUsesWith(VectorizedTree);
  2011. }
  2012. return VectorizedTree != 0;
  2013. }
  2014. private:
2015. /// \brief Calculate the cost of a reduction.
  2016. int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
  2017. Type *ScalarTy = FirstReducedVal->getType();
  2018. Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
  2019. int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
  2020. int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
  2021. IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
  2022. int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
  2023. int ScalarReduxCost =
  2024. ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
  2025. DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
  2026. << " for reduction that starts with " << *FirstReducedVal
  2027. << " (It is a "
  2028. << (IsPairwiseReduction ? "pairwise" : "splitting")
  2029. << " reduction)\n");
  2030. return VecReduxCost - ScalarReduxCost;
  2031. }
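// Worked example with hypothetical costs (assumption, not from the original
// source): if the target reports a pairwise reduction cost of 6, a splitting
// reduction cost of 4 and a scalar add cost of 1, then with ReduxWidth = 4 we
// pick the splitting form and return 4 - 4 * 1 = 0 as the added cost.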
  2032. static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
  2033. Value *R, const Twine &Name = "") {
  2034. if (Opcode == Instruction::FAdd)
  2035. return Builder.CreateFAdd(L, R, Name);
  2036. return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
  2037. }
  2038. /// \brief Emit a horizontal reduction of the vectorized value.
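/// (Illustrative note, not in the original source: for ReduxWidth = 4 with the
/// splitting strategy this emits log2(4) = 2 shuffle+op steps,
///   t1 = op(v,  shuffle(v,  <2,3,undef,undef>))
///   t2 = op(t1, shuffle(t1, <1,undef,undef,undef>))
/// and the scalar result is extracted from lane 0 of t2.)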
  2039. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
  2040. assert(VectorizedValue && "Need to have a vectorized tree node");
  2041. Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
  2042. assert(isPowerOf2_32(ReduxWidth) &&
  2043. "We only handle power-of-two reductions for now");
  2044. Value *TmpVec = ValToReduce;
  2045. for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
  2046. if (IsPairwiseReduction) {
  2047. Value *LeftMask =
  2048. createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
  2049. Value *RightMask =
  2050. createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
  2051. Value *LeftShuf = Builder.CreateShuffleVector(
  2052. TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
  2053. Value *RightShuf = Builder.CreateShuffleVector(
  2054. TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
  2055. "rdx.shuf.r");
  2056. TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
  2057. "bin.rdx");
  2058. } else {
  2059. Value *UpperHalf =
  2060. createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
  2061. Value *Shuf = Builder.CreateShuffleVector(
  2062. TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
  2063. TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
  2064. }
  2065. }
  2066. // The result is in the first element of the vector.
  2067. return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
  2068. }
  2069. };
  2070. /// \brief Recognize construction of vectors like
  2071. /// %ra = insertelement <4 x float> undef, float %s0, i32 0
  2072. /// %rb = insertelement <4 x float> %ra, float %s1, i32 1
  2073. /// %rc = insertelement <4 x float> %rb, float %s2, i32 2
  2074. /// %rd = insertelement <4 x float> %rc, float %s3, i32 3
  2075. ///
  2076. /// Returns true if it matches
  2077. ///
  2078. static bool findBuildVector(InsertElementInst *IE,
  2079. SmallVectorImpl<Value *> &Ops) {
  2080. if (!isa<UndefValue>(IE->getOperand(0)))
  2081. return false;
  2082. while (true) {
  2083. Ops.push_back(IE->getOperand(1));
  2084. if (IE->use_empty())
  2085. return false;
  2086. InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
  2087. if (!NextUse)
  2088. return true;
  2089. // If this isn't the final use, make sure the next insertelement is the only
2090. // use. It's OK if the final constructed vector is used multiple times.
  2091. if (!IE->hasOneUse())
  2092. return false;
  2093. IE = NextUse;
  2094. }
  2095. return false;
  2096. }
  2097. static bool PhiTypeSorterFunc(Value *V, Value *V2) {
  2098. return V->getType() < V2->getType();
  2099. }
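// Note (added for exposition, not in the original source): sorting the incoming
// PHIs with the predicate above groups values of the same type next to each
// other, so the loop below can hand tryToVectorizeList contiguous same-typed
// runs. The comparison orders Type pointers, which is arbitrary but sufficient
// for grouping because types are uniqued.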
  2100. bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  2101. bool Changed = false;
  2102. SmallVector<Value *, 4> Incoming;
  2103. SmallSet<Value *, 16> VisitedInstrs;
  2104. bool HaveVectorizedPhiNodes = true;
  2105. while (HaveVectorizedPhiNodes) {
  2106. HaveVectorizedPhiNodes = false;
  2107. // Collect the incoming values from the PHIs.
  2108. Incoming.clear();
  2109. for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
  2110. ++instr) {
  2111. PHINode *P = dyn_cast<PHINode>(instr);
  2112. if (!P)
  2113. break;
  2114. if (!VisitedInstrs.count(P))
  2115. Incoming.push_back(P);
  2116. }
  2117. // Sort by type.
  2118. std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
2119. // Try to vectorize elements based on their type.
  2120. for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
  2121. E = Incoming.end();
  2122. IncIt != E;) {
  2123. // Look for the next elements with the same type.
  2124. SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
  2125. while (SameTypeIt != E &&
  2126. (*SameTypeIt)->getType() == (*IncIt)->getType()) {
  2127. VisitedInstrs.insert(*SameTypeIt);
  2128. ++SameTypeIt;
  2129. }
  2130. // Try to vectorize them.
  2131. unsigned NumElts = (SameTypeIt - IncIt);
  2132. DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
  2133. if (NumElts > 1 &&
  2134. tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
2135. // Success, start over because instructions might have been changed.
  2136. HaveVectorizedPhiNodes = true;
  2137. Changed = true;
  2138. break;
  2139. }
  2140. // Start over at the next instruction of a different type (or the end).
  2141. IncIt = SameTypeIt;
  2142. }
  2143. }
  2144. VisitedInstrs.clear();
  2145. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
2146. // We may go through BB multiple times so skip the ones we have already checked.
  2147. if (!VisitedInstrs.insert(it))
  2148. continue;
  2149. if (isa<DbgInfoIntrinsic>(it))
  2150. continue;
  2151. // Try to vectorize reductions that use PHINodes.
  2152. if (PHINode *P = dyn_cast<PHINode>(it)) {
  2153. // Check that the PHI is a reduction PHI.
  2154. if (P->getNumIncomingValues() != 2)
  2155. return Changed;
  2156. Value *Rdx =
  2157. (P->getIncomingBlock(0) == BB
  2158. ? (P->getIncomingValue(0))
  2159. : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
  2160. // Check if this is a Binary Operator.
  2161. BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
  2162. if (!BI)
  2163. continue;
  2164. // Try to match and vectorize a horizontal reduction.
  2165. HorizontalReduction HorRdx;
  2166. if (ShouldVectorizeHor &&
  2167. HorRdx.matchAssociativeReduction(P, BI, DL) &&
  2168. HorRdx.tryToReduce(R, TTI)) {
  2169. Changed = true;
  2170. it = BB->begin();
  2171. e = BB->end();
  2172. continue;
  2173. }
  2174. Value *Inst = BI->getOperand(0);
  2175. if (Inst == P)
  2176. Inst = BI->getOperand(1);
  2177. if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
  2178. // We would like to start over since some instructions are deleted
2179. // and the iterator may become invalid.
  2180. Changed = true;
  2181. it = BB->begin();
  2182. e = BB->end();
  2183. continue;
  2184. }
  2185. continue;
  2186. }
  2187. // Try to vectorize horizontal reductions feeding into a store.
  2188. if (ShouldStartVectorizeHorAtStore)
  2189. if (StoreInst *SI = dyn_cast<StoreInst>(it))
  2190. if (BinaryOperator *BinOp =
  2191. dyn_cast<BinaryOperator>(SI->getValueOperand())) {
  2192. HorizontalReduction HorRdx;
  2193. if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
  2194. HorRdx.tryToReduce(R, TTI)) ||
  2195. tryToVectorize(BinOp, R))) {
  2196. Changed = true;
  2197. it = BB->begin();
  2198. e = BB->end();
  2199. continue;
  2200. }
  2201. }
  2202. // Try to vectorize trees that start at compare instructions.
  2203. if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
  2204. if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
  2205. Changed = true;
  2206. // We would like to start over since some instructions are deleted
2207. // and the iterator may become invalid.
  2208. it = BB->begin();
  2209. e = BB->end();
  2210. continue;
  2211. }
  2212. for (int i = 0; i < 2; ++i) {
  2213. if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
  2214. if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
  2215. Changed = true;
  2216. // We would like to start over since some instructions are deleted
2217. // and the iterator may become invalid.
  2218. it = BB->begin();
  2219. e = BB->end();
  2220. }
  2221. }
  2222. }
  2223. continue;
  2224. }
  2225. // Try to vectorize trees that start at insertelement instructions.
  2226. if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
  2227. SmallVector<Value *, 8> Ops;
  2228. if (!findBuildVector(IE, Ops))
  2229. continue;
  2230. if (tryToVectorizeList(Ops, R)) {
  2231. Changed = true;
  2232. it = BB->begin();
  2233. e = BB->end();
  2234. }
  2235. continue;
  2236. }
  2237. }
  2238. return Changed;
  2239. }
  2240. bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  2241. bool Changed = false;
  2242. // Attempt to sort and vectorize each of the store-groups.
  2243. for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
  2244. it != e; ++it) {
  2245. if (it->second.size() < 2)
  2246. continue;
  2247. DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
  2248. << it->second.size() << ".\n");
  2249. // Process the stores in chunks of 16.
  2250. for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
  2251. unsigned Len = std::min<unsigned>(CE - CI, 16);
  2252. ArrayRef<StoreInst *> Chunk(&it->second[CI], Len);
  2253. Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R);
  2254. }
  2255. }
  2256. return Changed;
  2257. }
  2258. } // end anonymous namespace
  2259. char SLPVectorizer::ID = 0;
  2260. static const char lv_name[] = "SLP Vectorizer";
  2261. INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
  2262. INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
  2263. INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
  2264. INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
  2265. INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
  2266. INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
  2267. namespace llvm {
  2268. Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
  2269. }