//===--------------------- InterleavedAccessPass.cpp ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Interleaved Access pass, which identifies
// interleaved memory accesses and transforms them into target specific
// intrinsics.
//
// An interleaved load reads data from memory into several vectors,
// DE-interleaving the data by a factor. An interleaved store RE-interleaves
// the data from several vectors by a factor and writes it back to memory.
//
// As interleaved accesses are difficult to identify in CodeGen (mainly
// because the VECTOR_SHUFFLE DAG node is quite different from the
// shufflevector IR), we identify and transform them to intrinsics in this
// pass so the intrinsics can be easily matched into target specific
// instructions later in CodeGen.
//
// E.g. An interleaved load (Factor = 2):
//        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
//        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
//
// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
// intrinsic in ARM backend.
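//
// As an illustrative sketch only (the exact intrinsic name, signature and
// pointer bitcasting are target and LLVM-version specific), the AArch64
// lowering of the Factor = 2 example above produces IR along the lines of:
//        %p   = bitcast <8 x i32>* %ptr to <4 x i32>*
//        %ld2 = call { <4 x i32>, <4 x i32> }
//                    @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* %p)
//        %v0  = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
//        %v1  = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1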
//
// In X86, this can be further optimized into a set of target
// specific loads followed by an optimized sequence of shuffles.
//
// E.g. An interleaved store (Factor = 3):
//        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
//                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
//        store <12 x i32> %i.vec, <12 x i32>* %ptr
//
// It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
// intrinsic in ARM backend.
//
// Similarly, a set of interleaved stores can be transformed into an optimized
// sequence of shuffles followed by a set of target specific stores for X86.
//
//===----------------------------------------------------------------------===//

// ADT, IRBuilder and CommandLine headers are included explicitly for the
// containers, cl::opt and IRBuilder<> used below.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSubtargetInfo.h"

using namespace llvm;

#define DEBUG_TYPE "interleaved-access"

static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);
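
// Usage note (illustrative): because LowerInterleavedAccesses is a cl::opt,
// the transformation can be disabled from the command line of any tool that
// runs the CodeGen pipeline, e.g. an invocation along the lines of
//   llc -lower-interleaved-accesses=false input.ll
// keeps the wide loads/stores and shufflevectors intact.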

namespace {

class InterleavedAccess : public FunctionPass {
public:
  static char ID;

  InterleavedAccess() : FunctionPass(ID), DT(nullptr), TLI(nullptr) {
    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

private:
  DominatorTree *DT;
  const TargetLowering *TLI;

  /// The maximum supported interleave factor.
  unsigned MaxFactor;

  /// \brief Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(LoadInst *LI,
                            SmallVector<Instruction *, 32> &DeadInsts);

  /// \brief Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(StoreInst *SI,
                             SmallVector<Instruction *, 32> &DeadInsts);

  /// \brief Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);
};

} // end anonymous namespace.

char InterleavedAccess::ID = 0;

INITIALIZE_PASS_BEGIN(
    InterleavedAccess, "interleaved-access",
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(
    InterleavedAccess, "interleaved-access",
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

FunctionPass *llvm::createInterleavedAccessPass() {
  return new InterleavedAccess();
}

/// \brief Check if the mask is a DE-interleave mask of the given factor
/// \p Factor like:
///     <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
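///
/// For example, <1, 3, 5, 7> is a DE-interleave mask of Factor 2 with
/// Index 1, and <0, -1, 8, 12> (where -1 denotes an undef element, which is
/// ignored) is a DE-interleave mask of Factor 4 with Index 0.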
static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
                                       unsigned &Index) {
  // Check all potential start indices from 0 to (Factor - 1).
  for (Index = 0; Index < Factor; Index++) {
    unsigned i = 0;

    // Check that elements are in ascending order by Factor. Ignore undef
    // elements.
    for (; i < Mask.size(); i++)
      if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
        break;

    if (i == Mask.size())
      return true;
  }

  return false;
}

/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
///
/// E.g. DE-interleave masks (Factor = 2) could be:
///     <0, 2, 4, 6>    (mask of index 0 to extract even elements)
///     <1, 3, 5, 7>    (mask of index 1 to extract odd elements)
static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned &Index, unsigned MaxFactor) {
  if (Mask.size() < 2)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++)
    if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
      return true;

  return false;
}

/// \brief Check if the mask can be used in an interleaved store.
///
/// It checks for a more general pattern than the RE-interleave mask.
/// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
/// E.g. For a Factor of 3 (LaneLen=4):
///     <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
///
/// The particular case of an RE-interleave mask is:
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
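///
/// Note (illustrative): undef elements (written here as -1) are tolerated as
/// long as the defined elements of each lane remain consecutive. E.g. for a
/// Factor of 2 (LaneLen=4), <0, 4, -1, 5, 2, 6, -1, 7> is accepted as a
/// RE-interleave of two <4 x i32> operands.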
static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
                               unsigned MaxFactor, unsigned OpNumElts) {
  unsigned NumElts = Mask.size();
  if (NumElts < 4)
    return false;

  // Check potential Factors.
  for (Factor = 2; Factor <= MaxFactor; Factor++) {
    if (NumElts % Factor)
      continue;

    unsigned LaneLen = NumElts / Factor;
    if (!isPowerOf2_32(LaneLen))
      continue;

    // Check whether each element matches the general interleaved rule.
    // Ignore undef elements, as long as the defined elements match the rule.
    // Outer loop processes all factors (x, y, z in the above example).
    unsigned I = 0, J;
    for (; I < Factor; I++) {
      unsigned SavedLaneValue;
      unsigned SavedNoUndefs = 0;

      // Inner loop processes consecutive accesses (x, x+1... in the example).
      for (J = 0; J < LaneLen - 1; J++) {
        // Lane computes x's position in the Mask.
        unsigned Lane = J * Factor + I;
        unsigned NextLane = Lane + Factor;
        int LaneValue = Mask[Lane];
        int NextLaneValue = Mask[NextLane];

        // If both are defined, the values must be sequential.
        if (LaneValue >= 0 && NextLaneValue >= 0 &&
            LaneValue + 1 != NextLaneValue)
          break;

        // If the next value is undef, save the current one as reference.
        if (LaneValue >= 0 && NextLaneValue < 0) {
          SavedLaneValue = LaneValue;
          SavedNoUndefs = 1;
        }

        // Undefs are allowed, but defined elements must still be consecutive:
        // i.e.: x, ..., undef, ..., x + 2, ..., undef, ..., undef, ..., x + 5.
        // Verify this by storing the last non-undef followed by an undef, and
        // checking that subsequent non-undef mask values are incremented by
        // the corresponding distance.
        if (SavedNoUndefs > 0 && LaneValue < 0) {
          SavedNoUndefs++;
          if (NextLaneValue >= 0 &&
              SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
            break;
        }
      }

      if (J < LaneLen - 1)
        break;

      int StartMask = 0;
      if (Mask[I] >= 0) {
        // The start of the lane (J = 0) is defined; use it directly.
        StartMask = Mask[I];
      } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
        // StartMask defined by the last value in the lane.
        StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
      } else if (SavedNoUndefs > 0) {
        // StartMask defined by some non-undef value in the J loop.
        StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
      }
      // Else StartMask remains set to 0, i.e. all elements are undefs.

      if (StartMask < 0)
        break;
      // We must stay within the vectors; this case can happen with undefs.
      if (StartMask + LaneLen > OpNumElts * 2)
        break;
    }

    // Found an interleaved mask of the current factor.
    if (I == Factor)
      return true;
  }

  return false;
}

bool InterleavedAccess::lowerInterleavedLoad(
    LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
  if (!LI->isSimple())
    return false;

  SmallVector<ShuffleVectorInst *, 4> Shuffles;
  SmallVector<ExtractElementInst *, 4> Extracts;

  // Check if all users of this load are shufflevectors. If we encounter any
  // users that are extractelement instructions, we save them to later check if
  // they can be modified to extract from one of the shufflevectors instead of
  // the load.
  for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
    auto *Extract = dyn_cast<ExtractElementInst>(*UI);
    if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
      Extracts.push_back(Extract);
      continue;
    }
    ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
    if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
      return false;

    Shuffles.push_back(SVI);
  }

  if (Shuffles.empty())
    return false;

  unsigned Factor, Index;

  // Check if the first shufflevector is a DE-interleave shuffle.
  if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
                          MaxFactor))
    return false;

  // Holds the corresponding index for each DE-interleave shuffle.
  SmallVector<unsigned, 4> Indices;
  Indices.push_back(Index);

  Type *VecTy = Shuffles[0]->getType();

  // Check if the other shufflevectors are also DE-interleave shuffles of the
  // same type and factor as the first shufflevector.
  for (unsigned i = 1; i < Shuffles.size(); i++) {
    if (Shuffles[i]->getType() != VecTy)
      return false;

    if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
                                    Index))
      return false;

    Indices.push_back(Index);
  }

  // Try to modify users of the load that are extractelement instructions so
  // that they use the shufflevector instructions instead of the load.
  if (!tryReplaceExtracts(Extracts, Shuffles))
    return false;

  DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");

  // Try to create target specific intrinsics to replace the load and shuffles.
  if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
    return false;

  for (auto SVI : Shuffles)
    DeadInsts.push_back(SVI);
  DeadInsts.push_back(LI);
  return true;
}
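
// Illustrative example for tryReplaceExtracts (Factor = 2); the value names
// below are hypothetical:
//   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
//   %v0 = shufflevector <8 x i32> %wide.vec, <8 x i32> undef,
//                       <4 x i32> <i32 0, i32 2, i32 4, i32 6>
//   %e  = extractelement <8 x i32> %wide.vec, i32 2
// Element 2 of %wide.vec is selected into position 1 of %v0, so the extract
// can be rewritten as "extractelement <4 x i32> %v0, i32 1", leaving the wide
// load with only shufflevector users, as required for lowering.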
bool InterleavedAccess::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {
  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Maps extractelement instructions to vector-index pairs. The extractelement
  // instructions will be modified to use the new vector and index operands.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The vector index that is extracted.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }

  return true;
}

bool InterleavedAccess::lowerInterleavedStore(
    StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
  if (!SI->isSimple())
    return false;

  ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
  if (!SVI || !SVI->hasOneUse())
    return false;

  // Check if the shufflevector is a RE-interleave shuffle.
  unsigned Factor;
  unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
    return false;

  DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");

  // Try to create target specific intrinsics to replace the store and shuffle.
  if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
    return false;

  // Already have a new target specific interleaved store. Erase the old store.
  DeadInsts.push_back(SI);
  DeadInsts.push_back(SVI);
  return true;
}

bool InterleavedAccess::runOnFunction(Function &F) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC || !LowerInterleavedAccesses)
    return false;

  DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");

  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  auto &TM = TPC->getTM<TargetMachine>();
  TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  MaxFactor = TLI->getMaxSupportedInterleaveFactor();

  // Holds dead instructions that will be erased later.
  SmallVector<Instruction *, 32> DeadInsts;
  bool Changed = false;

  for (auto &I : instructions(F)) {
    if (LoadInst *LI = dyn_cast<LoadInst>(&I))
      Changed |= lowerInterleavedLoad(LI, DeadInsts);

    if (StoreInst *SI = dyn_cast<StoreInst>(&I))
      Changed |= lowerInterleavedStore(SI, DeadInsts);
  }

  for (auto I : DeadInsts)
    I->eraseFromParent();

  return Changed;
}
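
// Scheduling note (illustrative sketch, not taken verbatim from any target):
// backends add this pass from their TargetPassConfig, typically early in the
// IR portion of the CodeGen pipeline, along the lines of
//
//   void MyTargetPassConfig::addIRPasses() { // MyTargetPassConfig is hypothetical
//     if (TM->getOptLevel() != CodeGenOpt::None)
//       addPass(createInterleavedAccessPass());
//     TargetPassConfig::addIRPasses();
//   }
//
// Because runOnFunction() bails out when no TargetPassConfig is available,
// the pass is a no-op outside of a target's CodeGen pipeline.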