InterleavedAccessPass.cpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. //===--------------------- InterleavedAccessPass.cpp ----------------------===//
  2. //
  3. // The LLVM Compiler Infrastructure
  4. //
  5. // This file is distributed under the University of Illinois Open Source
  6. // License. See LICENSE.TXT for details.
  7. //
  8. //===----------------------------------------------------------------------===//
  9. //
  10. // This file implements the Interleaved Access pass, which identifies
  11. // interleaved memory accesses and transforms them into target specific
  12. // intrinsics.
  13. //
  14. // An interleaved load reads data from memory into several vectors, with
  15. // DE-interleaving the data on a factor. An interleaved store writes several
  16. // vectors to memory with RE-interleaving the data on a factor.
  17. //
  18. // As interleaved accesses are difficult to identified in CodeGen (mainly
  19. // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
  20. // IR), we identify and transform them to intrinsics in this pass so the
  21. // intrinsics can be easily matched into target specific instructions later in
  22. // CodeGen.
  23. //
  24. // E.g. An interleaved load (Factor = 2):
  25. // %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  26. // %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
  27. // %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
  28. //
  29. // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
  30. // intrinsic in ARM backend.
  31. //
  32. // E.g. An interleaved store (Factor = 3):
  33. // %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
  34. // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  35. // store <12 x i32> %i.vec, <12 x i32>* %ptr
  36. //
  37. // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
  38. // intrinsic in ARM backend.
  39. //
  40. //===----------------------------------------------------------------------===//
  41. #include "llvm/CodeGen/Passes.h"
  42. #include "llvm/IR/Dominators.h"
  43. #include "llvm/IR/InstIterator.h"
  44. #include "llvm/Support/Debug.h"
  45. #include "llvm/Support/MathExtras.h"
  46. #include "llvm/Support/raw_ostream.h"
  47. #include "llvm/Target/TargetLowering.h"
  48. #include "llvm/Target/TargetSubtargetInfo.h"
  49. using namespace llvm;
  50. #define DEBUG_TYPE "interleaved-access"
  51. static cl::opt<bool> LowerInterleavedAccesses(
  52. "lower-interleaved-accesses",
  53. cl::desc("Enable lowering interleaved accesses to intrinsics"),
  54. cl::init(true), cl::Hidden);
  55. static unsigned MaxFactor; // The maximum supported interleave factor.
  56. namespace {
  57. class InterleavedAccess : public FunctionPass {
  58. public:
  59. static char ID;
  60. InterleavedAccess(const TargetMachine *TM = nullptr)
  61. : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
  62. initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  63. }
  64. const char *getPassName() const override { return "Interleaved Access Pass"; }
  65. bool runOnFunction(Function &F) override;
  66. void getAnalysisUsage(AnalysisUsage &AU) const override {
  67. AU.addRequired<DominatorTreeWrapperPass>();
  68. AU.addPreserved<DominatorTreeWrapperPass>();
  69. }
  70. private:
  71. DominatorTree *DT;
  72. const TargetMachine *TM;
  73. const TargetLowering *TLI;
  74. /// \brief Transform an interleaved load into target specific intrinsics.
  75. bool lowerInterleavedLoad(LoadInst *LI,
  76. SmallVector<Instruction *, 32> &DeadInsts);
  77. /// \brief Transform an interleaved store into target specific intrinsics.
  78. bool lowerInterleavedStore(StoreInst *SI,
  79. SmallVector<Instruction *, 32> &DeadInsts);
  80. /// \brief Returns true if the uses of an interleaved load by the
  81. /// extractelement instructions in \p Extracts can be replaced by uses of the
  82. /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  83. /// replacements are also performed.
  84. bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
  85. ArrayRef<ShuffleVectorInst *> Shuffles);
  86. };
  87. } // end anonymous namespace.
  88. char InterleavedAccess::ID = 0;
  89. INITIALIZE_TM_PASS_BEGIN(
  90. InterleavedAccess, "interleaved-access",
  91. "Lower interleaved memory accesses to target specific intrinsics", false,
  92. false)
  93. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  94. INITIALIZE_TM_PASS_END(
  95. InterleavedAccess, "interleaved-access",
  96. "Lower interleaved memory accesses to target specific intrinsics", false,
  97. false)
  98. FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
  99. return new InterleavedAccess(TM);
  100. }
  101. /// \brief Check if the mask is a DE-interleave mask of the given factor
  102. /// \p Factor like:
  103. /// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
  104. static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
  105. unsigned &Index) {
  106. // Check all potential start indices from 0 to (Factor - 1).
  107. for (Index = 0; Index < Factor; Index++) {
  108. unsigned i = 0;
  109. // Check that elements are in ascending order by Factor. Ignore undef
  110. // elements.
  111. for (; i < Mask.size(); i++)
  112. if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
  113. break;
  114. if (i == Mask.size())
  115. return true;
  116. }
  117. return false;
  118. }
  119. /// \brief Check if the mask is a DE-interleave mask for an interleaved load.
  120. ///
  121. /// E.g. DE-interleave masks (Factor = 2) could be:
  122. /// <0, 2, 4, 6> (mask of index 0 to extract even elements)
  123. /// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
  124. static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
  125. unsigned &Index) {
  126. if (Mask.size() < 2)
  127. return false;
  128. // Check potential Factors.
  129. for (Factor = 2; Factor <= MaxFactor; Factor++)
  130. if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
  131. return true;
  132. return false;
  133. }
  134. /// \brief Check if the mask is RE-interleave mask for an interleaved store.
  135. ///
  136. /// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
  137. ///
  138. /// E.g. The RE-interleave mask (Factor = 2) could be:
  139. /// <0, 4, 1, 5, 2, 6, 3, 7>
  140. static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
  141. unsigned NumElts = Mask.size();
  142. if (NumElts < 4)
  143. return false;
  144. // Check potential Factors.
  145. for (Factor = 2; Factor <= MaxFactor; Factor++) {
  146. if (NumElts % Factor)
  147. continue;
  148. unsigned NumSubElts = NumElts / Factor;
  149. if (!isPowerOf2_32(NumSubElts))
  150. continue;
  151. // Check whether each element matchs the RE-interleaved rule. Ignore undef
  152. // elements.
  153. unsigned i = 0;
  154. for (; i < NumElts; i++)
  155. if (Mask[i] >= 0 &&
  156. static_cast<unsigned>(Mask[i]) !=
  157. (i % Factor) * NumSubElts + i / Factor)
  158. break;
  159. // Find a RE-interleaved mask of current factor.
  160. if (i == NumElts)
  161. return true;
  162. }
  163. return false;
  164. }
  165. bool InterleavedAccess::lowerInterleavedLoad(
  166. LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
  167. if (!LI->isSimple())
  168. return false;
  169. SmallVector<ShuffleVectorInst *, 4> Shuffles;
  170. SmallVector<ExtractElementInst *, 4> Extracts;
  171. // Check if all users of this load are shufflevectors. If we encounter any
  172. // users that are extractelement instructions, we save them to later check if
  173. // they can be modifed to extract from one of the shufflevectors instead of
  174. // the load.
  175. for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
  176. auto *Extract = dyn_cast<ExtractElementInst>(*UI);
  177. if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
  178. Extracts.push_back(Extract);
  179. continue;
  180. }
  181. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
  182. if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
  183. return false;
  184. Shuffles.push_back(SVI);
  185. }
  186. if (Shuffles.empty())
  187. return false;
  188. unsigned Factor, Index;
  189. // Check if the first shufflevector is DE-interleave shuffle.
  190. if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
  191. return false;
  192. // Holds the corresponding index for each DE-interleave shuffle.
  193. SmallVector<unsigned, 4> Indices;
  194. Indices.push_back(Index);
  195. Type *VecTy = Shuffles[0]->getType();
  196. // Check if other shufflevectors are also DE-interleaved of the same type
  197. // and factor as the first shufflevector.
  198. for (unsigned i = 1; i < Shuffles.size(); i++) {
  199. if (Shuffles[i]->getType() != VecTy)
  200. return false;
  201. if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
  202. Index))
  203. return false;
  204. Indices.push_back(Index);
  205. }
  206. // Try and modify users of the load that are extractelement instructions to
  207. // use the shufflevector instructions instead of the load.
  208. if (!tryReplaceExtracts(Extracts, Shuffles))
  209. return false;
  210. DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
  211. // Try to create target specific intrinsics to replace the load and shuffles.
  212. if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
  213. return false;
  214. for (auto SVI : Shuffles)
  215. DeadInsts.push_back(SVI);
  216. DeadInsts.push_back(LI);
  217. return true;
  218. }
  219. bool InterleavedAccess::tryReplaceExtracts(
  220. ArrayRef<ExtractElementInst *> Extracts,
  221. ArrayRef<ShuffleVectorInst *> Shuffles) {
  222. // If there aren't any extractelement instructions to modify, there's nothing
  223. // to do.
  224. if (Extracts.empty())
  225. return true;
  226. // Maps extractelement instructions to vector-index pairs. The extractlement
  227. // instructions will be modified to use the new vector and index operands.
  228. DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
  229. for (auto *Extract : Extracts) {
  230. // The vector index that is extracted.
  231. auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
  232. auto Index = IndexOperand->getSExtValue();
  233. // Look for a suitable shufflevector instruction. The goal is to modify the
  234. // extractelement instruction (which uses an interleaved load) to use one
  235. // of the shufflevector instructions instead of the load.
  236. for (auto *Shuffle : Shuffles) {
  237. // If the shufflevector instruction doesn't dominate the extract, we
  238. // can't create a use of it.
  239. if (!DT->dominates(Shuffle, Extract))
  240. continue;
  241. // Inspect the indices of the shufflevector instruction. If the shuffle
  242. // selects the same index that is extracted, we can modify the
  243. // extractelement instruction.
  244. SmallVector<int, 4> Indices;
  245. Shuffle->getShuffleMask(Indices);
  246. for (unsigned I = 0; I < Indices.size(); ++I)
  247. if (Indices[I] == Index) {
  248. assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
  249. "Vector operations do not match");
  250. ReplacementMap[Extract] = std::make_pair(Shuffle, I);
  251. break;
  252. }
  253. // If we found a suitable shufflevector instruction, stop looking.
  254. if (ReplacementMap.count(Extract))
  255. break;
  256. }
  257. // If we did not find a suitable shufflevector instruction, the
  258. // extractelement instruction cannot be modified, so we must give up.
  259. if (!ReplacementMap.count(Extract))
  260. return false;
  261. }
  262. // Finally, perform the replacements.
  263. IRBuilder<> Builder(Extracts[0]->getContext());
  264. for (auto &Replacement : ReplacementMap) {
  265. auto *Extract = Replacement.first;
  266. auto *Vector = Replacement.second.first;
  267. auto Index = Replacement.second.second;
  268. Builder.SetInsertPoint(Extract);
  269. Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
  270. Extract->eraseFromParent();
  271. }
  272. return true;
  273. }
  274. bool InterleavedAccess::lowerInterleavedStore(
  275. StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
  276. if (!SI->isSimple())
  277. return false;
  278. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
  279. if (!SVI || !SVI->hasOneUse())
  280. return false;
  281. // Check if the shufflevector is RE-interleave shuffle.
  282. unsigned Factor;
  283. if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
  284. return false;
  285. DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
  286. // Try to create target specific intrinsics to replace the store and shuffle.
  287. if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
  288. return false;
  289. // Already have a new target specific interleaved store. Erase the old store.
  290. DeadInsts.push_back(SI);
  291. DeadInsts.push_back(SVI);
  292. return true;
  293. }
  294. bool InterleavedAccess::runOnFunction(Function &F) {
  295. if (!TM || !LowerInterleavedAccesses)
  296. return false;
  297. DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
  298. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  299. TLI = TM->getSubtargetImpl(F)->getTargetLowering();
  300. MaxFactor = TLI->getMaxSupportedInterleaveFactor();
  301. // Holds dead instructions that will be erased later.
  302. SmallVector<Instruction *, 32> DeadInsts;
  303. bool Changed = false;
  304. for (auto &I : instructions(F)) {
  305. if (LoadInst *LI = dyn_cast<LoadInst>(&I))
  306. Changed |= lowerInterleavedLoad(LI, DeadInsts);
  307. if (StoreInst *SI = dyn_cast<StoreInst>(&I))
  308. Changed |= lowerInterleavedStore(SI, DeadInsts);
  309. }
  310. for (auto I : DeadInsts)
  311. I->eraseFromParent();
  312. return Changed;
  313. }