InterleavedAccessPass.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472
  1. //===- InterleavedAccessPass.cpp ------------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file implements the Interleaved Access pass, which identifies
  10. // interleaved memory accesses and transforms them into target specific
  11. // intrinsics.
  12. //
  13. // An interleaved load reads data from memory into several vectors, with
  14. // DE-interleaving the data on a factor. An interleaved store writes several
  15. // vectors to memory with RE-interleaving the data on a factor.
  16. //
  17. // As interleaved accesses are difficult to identified in CodeGen (mainly
  18. // because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
  19. // IR), we identify and transform them to intrinsics in this pass so the
  20. // intrinsics can be easily matched into target specific instructions later in
  21. // CodeGen.
  22. //
  23. // E.g. An interleaved load (Factor = 2):
  24. // %wide.vec = load <8 x i32>, <8 x i32>* %ptr
  25. // %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
  26. // %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
  27. //
  28. // It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
  29. // intrinsic in ARM backend.
  30. //
  31. // In X86, this can be further optimized into a set of target
  32. // specific loads followed by an optimized sequence of shuffles.
  33. //
  34. // E.g. An interleaved store (Factor = 3):
  35. // %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
  36. // <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
  37. // store <12 x i32> %i.vec, <12 x i32>* %ptr
  38. //
  39. // It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
  40. // intrinsic in ARM backend.
  41. //
  42. // Similarly, a set of interleaved stores can be transformed into an optimized
  43. // sequence of shuffles followed by a set of target specific stores for X86.
  44. //
  45. //===----------------------------------------------------------------------===//
  46. #include "llvm/ADT/ArrayRef.h"
  47. #include "llvm/ADT/DenseMap.h"
  48. #include "llvm/ADT/SmallVector.h"
  49. #include "llvm/CodeGen/TargetLowering.h"
  50. #include "llvm/CodeGen/TargetPassConfig.h"
  51. #include "llvm/CodeGen/TargetSubtargetInfo.h"
  52. #include "llvm/IR/Constants.h"
  53. #include "llvm/IR/Dominators.h"
  54. #include "llvm/IR/Function.h"
  55. #include "llvm/IR/IRBuilder.h"
  56. #include "llvm/IR/InstIterator.h"
  57. #include "llvm/IR/Instruction.h"
  58. #include "llvm/IR/Instructions.h"
  59. #include "llvm/IR/Type.h"
  60. #include "llvm/Pass.h"
  61. #include "llvm/Support/Casting.h"
  62. #include "llvm/Support/CommandLine.h"
  63. #include "llvm/Support/Debug.h"
  64. #include "llvm/Support/MathExtras.h"
  65. #include "llvm/Support/raw_ostream.h"
  66. #include "llvm/Target/TargetMachine.h"
  67. #include <cassert>
  68. #include <utility>
  69. using namespace llvm;
  70. #define DEBUG_TYPE "interleaved-access"
  71. static cl::opt<bool> LowerInterleavedAccesses(
  72. "lower-interleaved-accesses",
  73. cl::desc("Enable lowering interleaved accesses to intrinsics"),
  74. cl::init(true), cl::Hidden);
  75. namespace {
  76. class InterleavedAccess : public FunctionPass {
  77. public:
  78. static char ID;
  79. InterleavedAccess() : FunctionPass(ID) {
  80. initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
  81. }
  82. StringRef getPassName() const override { return "Interleaved Access Pass"; }
  83. bool runOnFunction(Function &F) override;
  84. void getAnalysisUsage(AnalysisUsage &AU) const override {
  85. AU.addRequired<DominatorTreeWrapperPass>();
  86. AU.addPreserved<DominatorTreeWrapperPass>();
  87. }
  88. private:
  89. DominatorTree *DT = nullptr;
  90. const TargetLowering *TLI = nullptr;
  91. /// The maximum supported interleave factor.
  92. unsigned MaxFactor;
  93. /// Transform an interleaved load into target specific intrinsics.
  94. bool lowerInterleavedLoad(LoadInst *LI,
  95. SmallVector<Instruction *, 32> &DeadInsts);
  96. /// Transform an interleaved store into target specific intrinsics.
  97. bool lowerInterleavedStore(StoreInst *SI,
  98. SmallVector<Instruction *, 32> &DeadInsts);
  99. /// Returns true if the uses of an interleaved load by the
  100. /// extractelement instructions in \p Extracts can be replaced by uses of the
  101. /// shufflevector instructions in \p Shuffles instead. If so, the necessary
  102. /// replacements are also performed.
  103. bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
  104. ArrayRef<ShuffleVectorInst *> Shuffles);
  105. };
  106. } // end anonymous namespace.
  107. char InterleavedAccess::ID = 0;
  108. INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE,
  109. "Lower interleaved memory accesses to target specific intrinsics", false,
  110. false)
  111. INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
  112. INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE,
  113. "Lower interleaved memory accesses to target specific intrinsics", false,
  114. false)
  115. FunctionPass *llvm::createInterleavedAccessPass() {
  116. return new InterleavedAccess();
  117. }
  118. /// Check if the mask is a DE-interleave mask of the given factor
  119. /// \p Factor like:
  120. /// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
  121. static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
  122. unsigned &Index) {
  123. // Check all potential start indices from 0 to (Factor - 1).
  124. for (Index = 0; Index < Factor; Index++) {
  125. unsigned i = 0;
  126. // Check that elements are in ascending order by Factor. Ignore undef
  127. // elements.
  128. for (; i < Mask.size(); i++)
  129. if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
  130. break;
  131. if (i == Mask.size())
  132. return true;
  133. }
  134. return false;
  135. }
  136. /// Check if the mask is a DE-interleave mask for an interleaved load.
  137. ///
  138. /// E.g. DE-interleave masks (Factor = 2) could be:
  139. /// <0, 2, 4, 6> (mask of index 0 to extract even elements)
  140. /// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
  141. static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
  142. unsigned &Index, unsigned MaxFactor,
  143. unsigned NumLoadElements) {
  144. if (Mask.size() < 2)
  145. return false;
  146. // Check potential Factors.
  147. for (Factor = 2; Factor <= MaxFactor; Factor++) {
  148. // Make sure we don't produce a load wider than the input load.
  149. if (Mask.size() * Factor > NumLoadElements)
  150. return false;
  151. if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
  152. return true;
  153. }
  154. return false;
  155. }
  156. /// Check if the mask can be used in an interleaved store.
  157. //
  158. /// It checks for a more general pattern than the RE-interleave mask.
  159. /// I.e. <x, y, ... z, x+1, y+1, ...z+1, x+2, y+2, ...z+2, ...>
  160. /// E.g. For a Factor of 2 (LaneLen=4): <4, 32, 5, 33, 6, 34, 7, 35>
  161. /// E.g. For a Factor of 3 (LaneLen=4): <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
  162. /// E.g. For a Factor of 4 (LaneLen=2): <8, 2, 12, 4, 9, 3, 13, 5>
  163. ///
  164. /// The particular case of an RE-interleave mask is:
  165. /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
  166. /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
  167. static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
  168. unsigned MaxFactor, unsigned OpNumElts) {
  169. unsigned NumElts = Mask.size();
  170. if (NumElts < 4)
  171. return false;
  172. // Check potential Factors.
  173. for (Factor = 2; Factor <= MaxFactor; Factor++) {
  174. if (NumElts % Factor)
  175. continue;
  176. unsigned LaneLen = NumElts / Factor;
  177. if (!isPowerOf2_32(LaneLen))
  178. continue;
  179. // Check whether each element matches the general interleaved rule.
  180. // Ignore undef elements, as long as the defined elements match the rule.
  181. // Outer loop processes all factors (x, y, z in the above example)
  182. unsigned I = 0, J;
  183. for (; I < Factor; I++) {
  184. unsigned SavedLaneValue;
  185. unsigned SavedNoUndefs = 0;
  186. // Inner loop processes consecutive accesses (x, x+1... in the example)
  187. for (J = 0; J < LaneLen - 1; J++) {
  188. // Lane computes x's position in the Mask
  189. unsigned Lane = J * Factor + I;
  190. unsigned NextLane = Lane + Factor;
  191. int LaneValue = Mask[Lane];
  192. int NextLaneValue = Mask[NextLane];
  193. // If both are defined, values must be sequential
  194. if (LaneValue >= 0 && NextLaneValue >= 0 &&
  195. LaneValue + 1 != NextLaneValue)
  196. break;
  197. // If the next value is undef, save the current one as reference
  198. if (LaneValue >= 0 && NextLaneValue < 0) {
  199. SavedLaneValue = LaneValue;
  200. SavedNoUndefs = 1;
  201. }
  202. // Undefs are allowed, but defined elements must still be consecutive:
  203. // i.e.: x,..., undef,..., x + 2,..., undef,..., undef,..., x + 5, ....
  204. // Verify this by storing the last non-undef followed by an undef
  205. // Check that following non-undef masks are incremented with the
  206. // corresponding distance.
  207. if (SavedNoUndefs > 0 && LaneValue < 0) {
  208. SavedNoUndefs++;
  209. if (NextLaneValue >= 0 &&
  210. SavedLaneValue + SavedNoUndefs != (unsigned)NextLaneValue)
  211. break;
  212. }
  213. }
  214. if (J < LaneLen - 1)
  215. break;
  216. int StartMask = 0;
  217. if (Mask[I] >= 0) {
  218. // Check that the start of the I range (J=0) is greater than 0
  219. StartMask = Mask[I];
  220. } else if (Mask[(LaneLen - 1) * Factor + I] >= 0) {
  221. // StartMask defined by the last value in lane
  222. StartMask = Mask[(LaneLen - 1) * Factor + I] - J;
  223. } else if (SavedNoUndefs > 0) {
  224. // StartMask defined by some non-zero value in the j loop
  225. StartMask = SavedLaneValue - (LaneLen - 1 - SavedNoUndefs);
  226. }
  227. // else StartMask remains set to 0, i.e. all elements are undefs
  228. if (StartMask < 0)
  229. break;
  230. // We must stay within the vectors; This case can happen with undefs.
  231. if (StartMask + LaneLen > OpNumElts*2)
  232. break;
  233. }
  234. // Found an interleaved mask of current factor.
  235. if (I == Factor)
  236. return true;
  237. }
  238. return false;
  239. }
  240. bool InterleavedAccess::lowerInterleavedLoad(
  241. LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
  242. if (!LI->isSimple())
  243. return false;
  244. SmallVector<ShuffleVectorInst *, 4> Shuffles;
  245. SmallVector<ExtractElementInst *, 4> Extracts;
  246. // Check if all users of this load are shufflevectors. If we encounter any
  247. // users that are extractelement instructions, we save them to later check if
  248. // they can be modifed to extract from one of the shufflevectors instead of
  249. // the load.
  250. for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
  251. auto *Extract = dyn_cast<ExtractElementInst>(*UI);
  252. if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
  253. Extracts.push_back(Extract);
  254. continue;
  255. }
  256. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
  257. if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
  258. return false;
  259. Shuffles.push_back(SVI);
  260. }
  261. if (Shuffles.empty())
  262. return false;
  263. unsigned Factor, Index;
  264. unsigned NumLoadElements = LI->getType()->getVectorNumElements();
  265. // Check if the first shufflevector is DE-interleave shuffle.
  266. if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
  267. MaxFactor, NumLoadElements))
  268. return false;
  269. // Holds the corresponding index for each DE-interleave shuffle.
  270. SmallVector<unsigned, 4> Indices;
  271. Indices.push_back(Index);
  272. Type *VecTy = Shuffles[0]->getType();
  273. // Check if other shufflevectors are also DE-interleaved of the same type
  274. // and factor as the first shufflevector.
  275. for (unsigned i = 1; i < Shuffles.size(); i++) {
  276. if (Shuffles[i]->getType() != VecTy)
  277. return false;
  278. if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
  279. Index))
  280. return false;
  281. Indices.push_back(Index);
  282. }
  283. // Try and modify users of the load that are extractelement instructions to
  284. // use the shufflevector instructions instead of the load.
  285. if (!tryReplaceExtracts(Extracts, Shuffles))
  286. return false;
  287. LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
  288. // Try to create target specific intrinsics to replace the load and shuffles.
  289. if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
  290. return false;
  291. for (auto SVI : Shuffles)
  292. DeadInsts.push_back(SVI);
  293. DeadInsts.push_back(LI);
  294. return true;
  295. }
  296. bool InterleavedAccess::tryReplaceExtracts(
  297. ArrayRef<ExtractElementInst *> Extracts,
  298. ArrayRef<ShuffleVectorInst *> Shuffles) {
  299. // If there aren't any extractelement instructions to modify, there's nothing
  300. // to do.
  301. if (Extracts.empty())
  302. return true;
  303. // Maps extractelement instructions to vector-index pairs. The extractlement
  304. // instructions will be modified to use the new vector and index operands.
  305. DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
  306. for (auto *Extract : Extracts) {
  307. // The vector index that is extracted.
  308. auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
  309. auto Index = IndexOperand->getSExtValue();
  310. // Look for a suitable shufflevector instruction. The goal is to modify the
  311. // extractelement instruction (which uses an interleaved load) to use one
  312. // of the shufflevector instructions instead of the load.
  313. for (auto *Shuffle : Shuffles) {
  314. // If the shufflevector instruction doesn't dominate the extract, we
  315. // can't create a use of it.
  316. if (!DT->dominates(Shuffle, Extract))
  317. continue;
  318. // Inspect the indices of the shufflevector instruction. If the shuffle
  319. // selects the same index that is extracted, we can modify the
  320. // extractelement instruction.
  321. SmallVector<int, 4> Indices;
  322. Shuffle->getShuffleMask(Indices);
  323. for (unsigned I = 0; I < Indices.size(); ++I)
  324. if (Indices[I] == Index) {
  325. assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
  326. "Vector operations do not match");
  327. ReplacementMap[Extract] = std::make_pair(Shuffle, I);
  328. break;
  329. }
  330. // If we found a suitable shufflevector instruction, stop looking.
  331. if (ReplacementMap.count(Extract))
  332. break;
  333. }
  334. // If we did not find a suitable shufflevector instruction, the
  335. // extractelement instruction cannot be modified, so we must give up.
  336. if (!ReplacementMap.count(Extract))
  337. return false;
  338. }
  339. // Finally, perform the replacements.
  340. IRBuilder<> Builder(Extracts[0]->getContext());
  341. for (auto &Replacement : ReplacementMap) {
  342. auto *Extract = Replacement.first;
  343. auto *Vector = Replacement.second.first;
  344. auto Index = Replacement.second.second;
  345. Builder.SetInsertPoint(Extract);
  346. Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
  347. Extract->eraseFromParent();
  348. }
  349. return true;
  350. }
  351. bool InterleavedAccess::lowerInterleavedStore(
  352. StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
  353. if (!SI->isSimple())
  354. return false;
  355. ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
  356. if (!SVI || !SVI->hasOneUse())
  357. return false;
  358. // Check if the shufflevector is RE-interleave shuffle.
  359. unsigned Factor;
  360. unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
  361. if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
  362. return false;
  363. LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
  364. // Try to create target specific intrinsics to replace the store and shuffle.
  365. if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
  366. return false;
  367. // Already have a new target specific interleaved store. Erase the old store.
  368. DeadInsts.push_back(SI);
  369. DeadInsts.push_back(SVI);
  370. return true;
  371. }
  372. bool InterleavedAccess::runOnFunction(Function &F) {
  373. auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  374. if (!TPC || !LowerInterleavedAccesses)
  375. return false;
  376. LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
  377. DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  378. auto &TM = TPC->getTM<TargetMachine>();
  379. TLI = TM.getSubtargetImpl(F)->getTargetLowering();
  380. MaxFactor = TLI->getMaxSupportedInterleaveFactor();
  381. // Holds dead instructions that will be erased later.
  382. SmallVector<Instruction *, 32> DeadInsts;
  383. bool Changed = false;
  384. for (auto &I : instructions(F)) {
  385. if (LoadInst *LI = dyn_cast<LoadInst>(&I))
  386. Changed |= lowerInterleavedLoad(LI, DeadInsts);
  387. if (StoreInst *SI = dyn_cast<StoreInst>(&I))
  388. Changed |= lowerInterleavedStore(SI, DeadInsts);
  389. }
  390. for (auto I : DeadInsts)
  391. I->eraseFromParent();
  392. return Changed;
  393. }