InterleavedAccessPass.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. //===--------------------- InterleavedAccessPass.cpp ----------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the Interleaved Access pass, which identifies
  11. // interleaved memory accesses and transforms them into target specific
  12. // intrinsics.
  13. //
  14. // An interleaved load reads data from memory into several vectors, with
  15. // DE-interleaving the data on a factor. An interleaved store writes several
  16. // vectors to memory with RE-interleaving the data on a factor.
  17. //
  18. // As interleaved accesses are difficult to identified in CodeGen (mainly
  19. // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
  20. // IR), we identify and transform them to intrinsics in this pass so the
  21. // intrinsics can be easily matched into target specific instructions later in
  22. // CodeGen.
  23. //
  24. // E.g. An interleaved load (Factor = 2):
  25. // %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  26. // %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
  27. // %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
  28. //
  29. // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
  30. // intrinsic in ARM backend.
  31. //
  32. // In X86, this can be further optimized into a set of target
  33. // specific loads followed by an optimized sequence of shuffles.
  34. //
  35. // E.g. An interleaved store (Factor = 3):
  36. // %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
  37. // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  38. // store <12 x i32> %i.vec, <12 x i32>* %ptr
  39. //
  40. // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
  41. // intrinsic in ARM backend.
  42. //
  43. // Similarly, a set of interleaved stores can be transformed into an optimized
  44. // sequence of shuffles followed by a set of target specific stores for X86.
  45. //===----------------------------------------------------------------------===//
  46. #include "llvm/CodeGen/Passes.h"
  47. #include "llvm/IR/Dominators.h"
  48. #include "llvm/IR/InstIterator.h"
  49. #include "llvm/Support/Debug.h"
  50. #include "llvm/Support/MathExtras.h"
  51. #include "llvm/Support/raw_ostream.h"
  52. #include "llvm/Target/TargetLowering.h"
  53. #include "llvm/Target/TargetSubtargetInfo.h"
  54. using namespace llvm;
  55. #define DEBUG_TYPE "interleaved-access"
// Command-line switch to disable this lowering (enabled by default, hidden
// from -help).
static cl::opt<bool> LowerInterleavedAccesses(
    "lower-interleaved-accesses",
    cl::desc("Enable lowering interleaved accesses to intrinsics"),
    cl::init(true), cl::Hidden);

// The maximum supported interleave factor. Queried from the target in
// runOnFunction and read by the file-local mask-matching helpers.
// NOTE(review): file-scope mutable state — fine for a serial pass pipeline,
// but not thread-safe; confirm before running passes concurrently.
static unsigned MaxFactor;
namespace {

/// Function pass that recognizes interleaved memory accesses — a wide load
/// whose users are de-interleaving shufflevectors, or a re-interleaving
/// shufflevector feeding a wide store — and lowers them to target-specific
/// intrinsics via TargetLowering.
class InterleavedAccess : public FunctionPass {
public:
  static char ID;

  // TM may be null (e.g. when constructed via the default registry path);
  // runOnFunction bails out early in that case.
  InterleavedAccess(const TargetMachine *TM = nullptr)
      : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
    initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  }

  StringRef getPassName() const override { return "Interleaved Access Pass"; }

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // The dominator tree is consulted by tryReplaceExtracts and is never
    // structurally modified, so it can be preserved.
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

private:
  DominatorTree *DT;         // Refreshed per function in runOnFunction.
  const TargetMachine *TM;   // May be null; the pass is then a no-op.
  const TargetLowering *TLI; // Derived from TM per function.

  /// \brief Transform an interleaved load into target specific intrinsics.
  bool lowerInterleavedLoad(LoadInst *LI,
                            SmallVector<Instruction *, 32> &DeadInsts);

  /// \brief Transform an interleaved store into target specific intrinsics.
  bool lowerInterleavedStore(StoreInst *SI,
                             SmallVector<Instruction *, 32> &DeadInsts);

  /// \brief Returns true if the uses of an interleaved load by the
  /// extractelement instructions in \p Extracts can be replaced by uses of the
  /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  /// replacements are also performed.
  bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
                          ArrayRef<ShuffleVectorInst *> Shuffles);
};
} // end anonymous namespace.
char InterleavedAccess::ID = 0;

// Register the pass with the pass registry (TM-aware variant) and declare its
// dependency on the dominator tree analysis.
INITIALIZE_TM_PASS_BEGIN(
    InterleavedAccess, "interleaved-access",
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_TM_PASS_END(
    InterleavedAccess, "interleaved-access",
    "Lower interleaved memory accesses to target specific intrinsics", false,
    false)

/// Factory used by target backends to add this pass to their codegen
/// pipeline.
FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
  return new InterleavedAccess(TM);
}
  106. /// \brief Check if the mask is a DE-interleave mask of the given factor
  107. /// \p Factor like:
  108. /// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
  109. static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
  110. unsigned &Index) {
  111. // Check all potential start indices from 0 to (Factor - 1).
  112. for (Index = 0; Index < Factor; Index++) {
  113. unsigned i = 0;
  114. // Check that elements are in ascending order by Factor. Ignore undef
  115. // elements.
  116. for (; i < Mask.size(); i++)
  117. if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
  118. break;
  119. if (i == Mask.size())
  120. return true;
  121. }
  122. return false;
  123. }
  124. /// \brief Check if the mask is a DE-interleave mask for an interleaved load.
  125. ///
  126. /// E.g. DE-interleave masks (Factor = 2) could be:
  127. /// <0, 2, 4, 6> (mask of index 0 to extract even elements)
  128. /// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
  129. static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
  130. unsigned &Index) {
  131. if (Mask.size() < 2)
  132. return false;
  133. // Check potential Factors.
  134. for (Factor = 2; Factor <= MaxFactor; Factor++)
  135. if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
  136. return true;
  137. return false;
  138. }
  139. /// \brief Check if the mask is RE-interleave mask for an interleaved store.
  140. ///
  141. /// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
  142. ///
  143. /// E.g. The RE-interleave mask (Factor = 2) could be:
  144. /// <0, 4, 1, 5, 2, 6, 3, 7>
  145. static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
  146. unsigned NumElts = Mask.size();
  147. if (NumElts < 4)
  148. return false;
  149. // Check potential Factors.
  150. for (Factor = 2; Factor <= MaxFactor; Factor++) {
  151. if (NumElts % Factor)
  152. continue;
  153. unsigned NumSubElts = NumElts / Factor;
  154. if (!isPowerOf2_32(NumSubElts))
  155. continue;
  156. // Check whether each element matchs the RE-interleaved rule. Ignore undef
  157. // elements.
  158. unsigned i = 0;
  159. for (; i < NumElts; i++)
  160. if (Mask[i] >= 0 &&
  161. static_cast<unsigned>(Mask[i]) !=
  162. (i % Factor) * NumSubElts + i / Factor)
  163. break;
  164. // Find a RE-interleaved mask of current factor.
  165. if (i == NumElts)
  166. return true;
  167. }
  168. return false;
  169. }
  170. bool InterleavedAccess::lowerInterleavedLoad(
  171. LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
  172. if (!LI->isSimple())
  173. return false;
  174. SmallVector<ShuffleVectorInst *, 4> Shuffles;
  175. SmallVector<ExtractElementInst *, 4> Extracts;
  176. // Check if all users of this load are shufflevectors. If we encounter any
  177. // users that are extractelement instructions, we save them to later check if
  178. // they can be modifed to extract from one of the shufflevectors instead of
  179. // the load.
  180. for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
  181. auto *Extract = dyn_cast<ExtractElementInst>(*UI);
  182. if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
  183. Extracts.push_back(Extract);
  184. continue;
  185. }
  186. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
  187. if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
  188. return false;
  189. Shuffles.push_back(SVI);
  190. }
  191. if (Shuffles.empty())
  192. return false;
  193. unsigned Factor, Index;
  194. // Check if the first shufflevector is DE-interleave shuffle.
  195. if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
  196. return false;
  197. // Holds the corresponding index for each DE-interleave shuffle.
  198. SmallVector<unsigned, 4> Indices;
  199. Indices.push_back(Index);
  200. Type *VecTy = Shuffles[0]->getType();
  201. // Check if other shufflevectors are also DE-interleaved of the same type
  202. // and factor as the first shufflevector.
  203. for (unsigned i = 1; i < Shuffles.size(); i++) {
  204. if (Shuffles[i]->getType() != VecTy)
  205. return false;
  206. if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
  207. Index))
  208. return false;
  209. Indices.push_back(Index);
  210. }
  211. // Try and modify users of the load that are extractelement instructions to
  212. // use the shufflevector instructions instead of the load.
  213. if (!tryReplaceExtracts(Extracts, Shuffles))
  214. return false;
  215. DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
  216. // Try to create target specific intrinsics to replace the load and shuffles.
  217. if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
  218. return false;
  219. for (auto SVI : Shuffles)
  220. DeadInsts.push_back(SVI);
  221. DeadInsts.push_back(LI);
  222. return true;
  223. }
bool InterleavedAccess::tryReplaceExtracts(
    ArrayRef<ExtractElementInst *> Extracts,
    ArrayRef<ShuffleVectorInst *> Shuffles) {
  // If there aren't any extractelement instructions to modify, there's nothing
  // to do.
  if (Extracts.empty())
    return true;

  // Maps extractelement instructions to vector-index pairs. The extractelement
  // instructions will be modified to use the new vector and index operands.
  DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;

  for (auto *Extract : Extracts) {
    // The vector index that is extracted. The caller only collects extracts
    // with ConstantInt indices, so the cast<> cannot fail.
    auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
    auto Index = IndexOperand->getSExtValue();

    // Look for a suitable shufflevector instruction. The goal is to modify the
    // extractelement instruction (which uses an interleaved load) to use one
    // of the shufflevector instructions instead of the load.
    for (auto *Shuffle : Shuffles) {
      // If the shufflevector instruction doesn't dominate the extract, we
      // can't create a use of it.
      if (!DT->dominates(Shuffle, Extract))
        continue;

      // Inspect the indices of the shufflevector instruction. If the shuffle
      // selects the same index that is extracted, we can modify the
      // extractelement instruction.
      SmallVector<int, 4> Indices;
      Shuffle->getShuffleMask(Indices);
      for (unsigned I = 0; I < Indices.size(); ++I)
        if (Indices[I] == Index) {
          // Both instructions read from the same wide vector, so lane I of
          // the shuffle is exactly the extracted element.
          assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
                 "Vector operations do not match");
          ReplacementMap[Extract] = std::make_pair(Shuffle, I);
          break;
        }

      // If we found a suitable shufflevector instruction, stop looking.
      if (ReplacementMap.count(Extract))
        break;
    }

    // If we did not find a suitable shufflevector instruction, the
    // extractelement instruction cannot be modified, so we must give up.
    if (!ReplacementMap.count(Extract))
      return false;
  }

  // Finally, perform the replacements. Each old extract is RAUW'd with a new
  // extractelement that reads from the dominating shuffle, then erased.
  IRBuilder<> Builder(Extracts[0]->getContext());
  for (auto &Replacement : ReplacementMap) {
    auto *Extract = Replacement.first;
    auto *Vector = Replacement.second.first;
    auto Index = Replacement.second.second;
    Builder.SetInsertPoint(Extract);
    Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
    Extract->eraseFromParent();
  }
  return true;
}
  279. bool InterleavedAccess::lowerInterleavedStore(
  280. StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
  281. if (!SI->isSimple())
  282. return false;
  283. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
  284. if (!SVI || !SVI->hasOneUse())
  285. return false;
  286. // Check if the shufflevector is RE-interleave shuffle.
  287. unsigned Factor;
  288. if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
  289. return false;
  290. DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
  291. // Try to create target specific intrinsics to replace the store and shuffle.
  292. if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
  293. return false;
  294. // Already have a new target specific interleaved store. Erase the old store.
  295. DeadInsts.push_back(SI);
  296. DeadInsts.push_back(SVI);
  297. return true;
  298. }
  299. bool InterleavedAccess::runOnFunction(Function &F) {
  300. if (!TM || !LowerInterleavedAccesses)
  301. return false;
  302. DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
  303. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  304. TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  305. MaxFactor = TLI->getMaxSupportedInterleaveFactor();
  306. // Holds dead instructions that will be erased later.
  307. SmallVector<Instruction *, 32> DeadInsts;
  308. bool Changed = false;
  309. for (auto &I : instructions(F)) {
  310. if (LoadInst *LI = dyn_cast<LoadInst>(&I))
  311. Changed |= lowerInterleavedLoad(LI, DeadInsts);
  312. if (StoreInst *SI = dyn_cast<StoreInst>(&I))
  313. Changed |= lowerInterleavedStore(SI, DeadInsts);
  314. }
  315. for (auto I : DeadInsts)
  316. I->eraseFromParent();
  317. return Changed;
  318. }