  1. //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass tries to fuse DS instructions with nearby immediate offsets.
  10. // This will fuse operations such as
  11. // ds_read_b32 v0, v2 offset:16
  12. // ds_read_b32 v1, v2 offset:32
  13. // ==>
  14. // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
  15. //
  16. // The same is done for certain SMEM and VMEM opcodes, e.g.:
  17. // s_buffer_load_dword s4, s[0:3], 4
  18. // s_buffer_load_dword s5, s[0:3], 8
  19. // ==>
  20. // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
  21. //
  22. // This pass also tries to promote a constant offset into the immediate field by
  23. // adjusting the base. It tries to use a base from nearby instructions that
  24. // allows a 13-bit constant offset, and then promotes that 13-bit offset to
  25. // the immediate.
  26. // E.g.
  27. // s_movk_i32 s0, 0x1800
  28. // v_add_co_u32_e32 v0, vcc, s0, v2
  29. // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
  30. //
  31. // s_movk_i32 s0, 0x1000
  32. // v_add_co_u32_e32 v5, vcc, s0, v2
  33. // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  34. // global_load_dwordx2 v[5:6], v[5:6], off
  35. // global_load_dwordx2 v[0:1], v[0:1], off
  36. // =>
  37. // s_movk_i32 s0, 0x1000
  38. // v_add_co_u32_e32 v5, vcc, s0, v2
  39. // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  40. // global_load_dwordx2 v[5:6], v[5:6], off
  41. // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
  42. //
  43. // Future improvements:
  44. //
  45. // - This currently relies on the scheduler to place loads and stores next to
  46. // each other, and then only merges adjacent pairs of instructions. It would
  47. // be good to be more flexible with interleaved instructions, and possibly run
  48. // before scheduling. It currently misses stores of constants because loading
  49. // the constant into the data register is placed between the stores, although
  50. // this is arguably a scheduling problem.
  51. //
  52. // - Live interval recomputing seems inefficient. This currently only matches
  53. // one pair, and recomputes live intervals and moves on to the next pair. It
  54. // would be better to compute a list of all merges that need to occur.
  55. //
  56. // - With a list of instructions to process, we can also merge more. If a
  57. // cluster of loads has offsets that are too large to fit in the 8-bit
  58. // offset fields, but are close enough together to fit within an 8-bit range,
  59. // we can add to the base pointer and use the new, reduced offsets.
  60. //
  61. //===----------------------------------------------------------------------===//
  62. #include "AMDGPU.h"
  63. #include "AMDGPUSubtarget.h"
  64. #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  65. #include "SIInstrInfo.h"
  66. #include "SIRegisterInfo.h"
  67. #include "Utils/AMDGPUBaseInfo.h"
  68. #include "llvm/ADT/ArrayRef.h"
  69. #include "llvm/ADT/SmallVector.h"
  70. #include "llvm/ADT/StringRef.h"
  71. #include "llvm/Analysis/AliasAnalysis.h"
  72. #include "llvm/CodeGen/MachineBasicBlock.h"
  73. #include "llvm/CodeGen/MachineFunction.h"
  74. #include "llvm/CodeGen/MachineFunctionPass.h"
  75. #include "llvm/CodeGen/MachineInstr.h"
  76. #include "llvm/CodeGen/MachineInstrBuilder.h"
  77. #include "llvm/CodeGen/MachineOperand.h"
  78. #include "llvm/CodeGen/MachineRegisterInfo.h"
  79. #include "llvm/IR/DebugLoc.h"
  80. #include "llvm/Pass.h"
  81. #include "llvm/Support/Debug.h"
  82. #include "llvm/Support/MathExtras.h"
  83. #include "llvm/Support/raw_ostream.h"
  84. #include <algorithm>
  85. #include <cassert>
  86. #include <cstdlib>
  87. #include <iterator>
  88. #include <utility>
  89. using namespace llvm;
  90. #define DEBUG_TYPE "si-load-store-opt"
  91. namespace {
  92. enum InstClassEnum {
  93. UNKNOWN,
  94. DS_READ,
  95. DS_WRITE,
  96. S_BUFFER_LOAD_IMM,
  97. BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
  98. BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
  99. BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
  100. BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
  101. BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
  102. BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
  103. BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
  104. BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
  105. };
  106. enum RegisterEnum {
  107. SBASE = 0x1,
  108. SRSRC = 0x2,
  109. SOFFSET = 0x4,
  110. VADDR = 0x8,
  111. ADDR = 0x10,
  112. };
  113. class SILoadStoreOptimizer : public MachineFunctionPass {
  114. struct CombineInfo {
  115. MachineBasicBlock::iterator I;
  116. MachineBasicBlock::iterator Paired;
  117. unsigned EltSize;
  118. unsigned Offset0;
  119. unsigned Offset1;
  120. unsigned Width0;
  121. unsigned Width1;
  122. unsigned BaseOff;
  123. InstClassEnum InstClass;
  124. bool GLC0;
  125. bool GLC1;
  126. bool SLC0;
  127. bool SLC1;
  128. bool DLC0;
  129. bool DLC1;
  130. bool UseST64;
  131. SmallVector<MachineInstr *, 8> InstsToMove;
  132. };
  133. struct BaseRegisters {
  134. unsigned LoReg = 0;
  135. unsigned HiReg = 0;
  136. unsigned LoSubReg = 0;
  137. unsigned HiSubReg = 0;
  138. };
  139. struct MemAddress {
  140. BaseRegisters Base;
  141. int64_t Offset = 0;
  142. };
  143. using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
  144. private:
  145. const GCNSubtarget *STM = nullptr;
  146. const SIInstrInfo *TII = nullptr;
  147. const SIRegisterInfo *TRI = nullptr;
  148. MachineRegisterInfo *MRI = nullptr;
  149. AliasAnalysis *AA = nullptr;
  150. bool OptimizeAgain;
  151. static bool offsetsCanBeCombined(CombineInfo &CI);
  152. static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  153. static unsigned getNewOpcode(const CombineInfo &CI);
  154. static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  155. const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  156. unsigned getOpcodeWidth(const MachineInstr &MI) const;
  157. InstClassEnum getInstClass(unsigned Opc) const;
  158. unsigned getRegs(unsigned Opc) const;
  159. bool findMatchingInst(CombineInfo &CI);
  160. unsigned read2Opcode(unsigned EltSize) const;
  161. unsigned read2ST64Opcode(unsigned EltSize) const;
  162. MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
  163. unsigned write2Opcode(unsigned EltSize) const;
  164. unsigned write2ST64Opcode(unsigned EltSize) const;
  165. MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  166. MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  167. MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  168. MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
  169. void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
  170. int32_t NewOffset) const;
  171. unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  172. MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  173. Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  174. void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  175. /// Promotes constant offset to the immediate by adjusting the base. It
  176. /// tries to use a base from the nearby instructions that allows it to have
  177. /// a 13-bit constant offset which gets promoted to the immediate.
  178. bool promoteConstantOffsetToImm(MachineInstr &CI,
  179. MemInfoMap &Visited,
  180. SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  181. public:
  182. static char ID;
  183. SILoadStoreOptimizer() : MachineFunctionPass(ID) {
  184. initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  185. }
  186. bool optimizeBlock(MachineBasicBlock &MBB);
  187. bool runOnMachineFunction(MachineFunction &MF) override;
  188. StringRef getPassName() const override { return "SI Load Store Optimizer"; }
  189. void getAnalysisUsage(AnalysisUsage &AU) const override {
  190. AU.setPreservesCFG();
  191. AU.addRequired<AAResultsWrapperPass>();
  192. MachineFunctionPass::getAnalysisUsage(AU);
  193. }
  194. };
  195. } // end anonymous namespace.
  196. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
  197. "SI Load Store Optimizer", false, false)
  198. INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  199. INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
  200. false, false)
  201. char SILoadStoreOptimizer::ID = 0;
  202. char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
  203. FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  204. return new SILoadStoreOptimizer();
  205. }
  206. static void moveInstsAfter(MachineBasicBlock::iterator I,
  207. ArrayRef<MachineInstr *> InstsToMove) {
  208. MachineBasicBlock *MBB = I->getParent();
  209. ++I;
  210. for (MachineInstr *MI : InstsToMove) {
  211. MI->removeFromParent();
  212. MBB->insert(I, MI);
  213. }
  214. }
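// Record every register defined by MI, and every physical register it reads,
// so that later instructions depending on them can be detected.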
  215. static void addDefsUsesToList(const MachineInstr &MI,
  216. DenseSet<unsigned> &RegDefs,
  217. DenseSet<unsigned> &PhysRegUses) {
  218. for (const MachineOperand &Op : MI.operands()) {
  219. if (Op.isReg()) {
  220. if (Op.isDef())
  221. RegDefs.insert(Op.getReg());
  222. else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
  223. PhysRegUses.insert(Op.getReg());
  224. }
  225. }
  226. }
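// Two memory accesses can be reordered if neither of them may write memory,
// or if alias analysis shows they do not access the same location.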
  227. static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
  228. MachineBasicBlock::iterator B,
  229. AliasAnalysis *AA) {
  230. // RAW or WAR - cannot reorder
  231. // WAW - cannot reorder
  232. // RAR - safe to reorder
  233. return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
  234. }
  235. // Add MI and its defs to the lists if MI reads one of the defs that are
  236. // already in the list. Returns true in that case.
  237. static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
  238. DenseSet<unsigned> &PhysRegUses,
  239. SmallVectorImpl<MachineInstr *> &Insts) {
  240. for (MachineOperand &Use : MI.operands()) {
  241. // If one of the defs is read, then there is a use of Def between I and the
  242. // instruction that I will potentially be merged with. We will need to move
  243. // this instruction after the merged instructions.
  244. //
  245. // Similarly, if there is a def which is read by an instruction that is to
  246. // be moved for merging, then we need to move the def-instruction as well.
  247. // This can only happen for physical registers such as M0; virtual
  248. // registers are in SSA form.
  249. if (Use.isReg() &&
  250. ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
  251. (Use.isDef() && RegDefs.count(Use.getReg())) ||
  252. (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
  253. PhysRegUses.count(Use.getReg())))) {
  254. Insts.push_back(&MI);
  255. addDefsUsesToList(MI, RegDefs, PhysRegUses);
  256. return true;
  257. }
  258. }
  259. return false;
  260. }
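// Check that every load or store in InstsToMove can be reordered with MemOp;
// instructions that do not touch memory are always safe to move across it.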
  261. static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
  262. ArrayRef<MachineInstr *> InstsToMove,
  263. AliasAnalysis *AA) {
  264. assert(MemOp.mayLoadOrStore());
  265. for (MachineInstr *InstToMove : InstsToMove) {
  266. if (!InstToMove->mayLoadOrStore())
  267. continue;
  268. if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
  269. return false;
  270. }
  271. return true;
  272. }
  273. // This function assumes that \p A and \p B are identical except for
  274. // size and offset, and that they reference adjacent memory.
  275. static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
  276. const MachineMemOperand *A,
  277. const MachineMemOperand *B) {
  278. unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  279. unsigned Size = A->getSize() + B->getSize();
  280. // This function adds the offset parameter to the existing offset for A,
  281. // so we pass 0 here as the offset and then manually set it to the correct
  282. // value after the call.
  283. MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  284. MMO->setOffset(MinOffset);
  285. return MMO;
  286. }
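// Check whether the two accesses described by CI can share one instruction:
// SMEM/VMEM accesses must be adjacent with matching cache bits, while DS
// offsets are rewritten (possibly using the stride-64 forms or a shifted base)
// so that they fit the 8-bit offset fields.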
  287. bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  288. // XXX - Would the same offset be OK? Is there any reason this would happen or
  289. // be useful?
  290. if (CI.Offset0 == CI.Offset1)
  291. return false;
  292. // This won't be valid if the offset isn't aligned.
  293. if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
  294. return false;
  295. unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  296. unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  297. CI.UseST64 = false;
  298. CI.BaseOff = 0;
  299. // Handle SMEM and VMEM instructions.
  300. if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
  301. return (EltOffset0 + CI.Width0 == EltOffset1 ||
  302. EltOffset1 + CI.Width1 == EltOffset0) &&
  303. CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
  304. (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  305. }
  306. // If the offset in elements doesn't fit in 8 bits, we might be able to use
  307. // the stride 64 versions.
  308. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
  309. isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
  310. CI.Offset0 = EltOffset0 / 64;
  311. CI.Offset1 = EltOffset1 / 64;
  312. CI.UseST64 = true;
  313. return true;
  314. }
  315. // Check if the new offsets fit in the reduced 8-bit range.
  316. if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
  317. CI.Offset0 = EltOffset0;
  318. CI.Offset1 = EltOffset1;
  319. return true;
  320. }
  321. // Try to shift base address to decrease offsets.
  322. unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  323. CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
  324. if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
  325. CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
  326. CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
  327. CI.UseST64 = true;
  328. return true;
  329. }
  330. if (isUInt<8>(OffsetDiff)) {
  331. CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
  332. CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
  333. return true;
  334. }
  335. return false;
  336. }
  337. bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
  338. const CombineInfo &CI) {
  339. const unsigned Width = (CI.Width0 + CI.Width1);
  340. switch (CI.InstClass) {
  341. default:
  342. return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  343. case S_BUFFER_LOAD_IMM:
  344. switch (Width) {
  345. default:
  346. return false;
  347. case 2:
  348. case 4:
  349. return true;
  350. }
  351. }
  352. }
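// Return the number of elements (dwords) accessed by MI, or 0 if the opcode
// is not handled by this pass.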
  353. unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) const {
  354. const unsigned Opc = MI.getOpcode();
  355. if (TII->isMUBUF(MI)) {
  356. // FIXME: Handle d16 correctly
  357. return AMDGPU::getMUBUFElements(Opc);
  358. }
  359. switch (Opc) {
  360. default:
  361. return 0;
  362. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  363. return 1;
  364. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  365. return 2;
  366. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  367. return 4;
  368. }
  369. }
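// Classify Opc into one of the InstClassEnum categories handled by this pass,
// or UNKNOWN if it cannot be merged.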
  370. InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) const {
  371. if (TII->isMUBUF(Opc)) {
  372. const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
  373. // If we couldn't identify the opcode, bail out.
  374. if (baseOpcode == -1) {
  375. return UNKNOWN;
  376. }
  377. switch (baseOpcode) {
  378. default:
  379. return UNKNOWN;
  380. case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
  381. return BUFFER_LOAD_OFFEN;
  382. case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
  383. return BUFFER_LOAD_OFFSET;
  384. case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
  385. return BUFFER_STORE_OFFEN;
  386. case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
  387. return BUFFER_STORE_OFFSET;
  388. case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
  389. return BUFFER_LOAD_OFFEN_exact;
  390. case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
  391. return BUFFER_LOAD_OFFSET_exact;
  392. case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
  393. return BUFFER_STORE_OFFEN_exact;
  394. case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
  395. return BUFFER_STORE_OFFSET_exact;
  396. }
  397. }
  398. switch (Opc) {
  399. default:
  400. return UNKNOWN;
  401. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  402. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  403. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  404. return S_BUFFER_LOAD_IMM;
  405. case AMDGPU::DS_READ_B32:
  406. case AMDGPU::DS_READ_B64:
  407. case AMDGPU::DS_READ_B32_gfx9:
  408. case AMDGPU::DS_READ_B64_gfx9:
  409. return DS_READ;
  410. case AMDGPU::DS_WRITE_B32:
  411. case AMDGPU::DS_WRITE_B64:
  412. case AMDGPU::DS_WRITE_B32_gfx9:
  413. case AMDGPU::DS_WRITE_B64_gfx9:
  414. return DS_WRITE;
  415. }
  416. }
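// Return a RegisterEnum bitmask describing which address operands
// (addr, sbase, srsrc, soffset, vaddr) the opcode uses.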
  417. unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) const {
  418. if (TII->isMUBUF(Opc)) {
  419. unsigned result = 0;
  420. if (AMDGPU::getMUBUFHasVAddr(Opc)) {
  421. result |= VADDR;
  422. }
  423. if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
  424. result |= SRSRC;
  425. }
  426. if (AMDGPU::getMUBUFHasSoffset(Opc)) {
  427. result |= SOFFSET;
  428. }
  429. return result;
  430. }
  431. switch (Opc) {
  432. default:
  433. return 0;
  434. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  435. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  436. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  437. return SBASE;
  438. case AMDGPU::DS_READ_B32:
  439. case AMDGPU::DS_READ_B64:
  440. case AMDGPU::DS_READ_B32_gfx9:
  441. case AMDGPU::DS_READ_B64_gfx9:
  442. case AMDGPU::DS_WRITE_B32:
  443. case AMDGPU::DS_WRITE_B64:
  444. case AMDGPU::DS_WRITE_B32_gfx9:
  445. case AMDGPU::DS_WRITE_B64_gfx9:
  446. return ADDR;
  447. }
  448. }
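// Scan forward from CI.I for a second instruction of the same class and base
// address that can be merged with it, filling in the rest of CI and collecting
// any instructions that must be moved below the merge point.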
  449. bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  450. MachineBasicBlock *MBB = CI.I->getParent();
  451. MachineBasicBlock::iterator E = MBB->end();
  452. MachineBasicBlock::iterator MBBI = CI.I;
  453. const unsigned Opc = CI.I->getOpcode();
  454. const InstClassEnum InstClass = getInstClass(Opc);
  455. if (InstClass == UNKNOWN) {
  456. return false;
  457. }
  458. const unsigned Regs = getRegs(Opc);
  459. unsigned AddrOpName[5] = {0};
  460. int AddrIdx[5];
  461. const MachineOperand *AddrReg[5];
  462. unsigned NumAddresses = 0;
  463. if (Regs & ADDR) {
  464. AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  465. }
  466. if (Regs & SBASE) {
  467. AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  468. }
  469. if (Regs & SRSRC) {
  470. AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  471. }
  472. if (Regs & SOFFSET) {
  473. AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  474. }
  475. if (Regs & VADDR) {
  476. AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  477. }
  478. for (unsigned i = 0; i < NumAddresses; i++) {
  479. AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
  480. AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
  481. // We only ever merge operations with the same base address register, so
  482. // don't bother scanning forward if there are no other uses.
  483. if (AddrReg[i]->isReg() &&
  484. (Register::isPhysicalRegister(AddrReg[i]->getReg()) ||
  485. MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
  486. return false;
  487. }
  488. ++MBBI;
  489. DenseSet<unsigned> RegDefsToMove;
  490. DenseSet<unsigned> PhysRegUsesToMove;
  491. addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
  492. for (; MBBI != E; ++MBBI) {
  493. const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
  494. if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
  495. (IsDS && (MBBI->getOpcode() != Opc))) {
  496. // This is not a matching DS instruction, but we can keep looking as
  497. // long as one of these conditions is met:
  498. // 1. It is safe to move I down past MBBI.
  499. // 2. It is safe to move MBBI down past the instruction that I will
  500. // be merged into.
  501. if (MBBI->hasUnmodeledSideEffects()) {
  502. // We can't re-order this instruction with respect to other memory
  503. // operations, so we fail both conditions mentioned above.
  504. return false;
  505. }
  506. if (MBBI->mayLoadOrStore() &&
  507. (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
  508. !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
  509. // We fail condition #1, but we may still be able to satisfy condition
  510. // #2. Add this instruction to the move list and then we will check
  511. // if condition #2 holds once we have selected the matching instruction.
  512. CI.InstsToMove.push_back(&*MBBI);
  513. addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
  514. continue;
  515. }
  516. // When we match I with another DS instruction we will be moving I down
  517. // to the location of the matched instruction, so any uses of I will need to
  518. // be moved down as well.
  519. addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
  520. CI.InstsToMove);
  521. continue;
  522. }
  523. // Don't merge volatiles.
  524. if (MBBI->hasOrderedMemoryRef())
  525. return false;
  526. // Handle a case like
  527. // DS_WRITE_B32 addr, v, idx0
  528. // w = DS_READ_B32 addr, idx0
  529. // DS_WRITE_B32 addr, f(w), idx1
  530. // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
  531. // merging of the two writes.
  532. if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
  533. CI.InstsToMove))
  534. continue;
  535. bool Match = true;
  536. for (unsigned i = 0; i < NumAddresses; i++) {
  537. const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
  538. if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
  539. if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
  540. AddrReg[i]->getImm() != AddrRegNext.getImm()) {
  541. Match = false;
  542. break;
  543. }
  544. continue;
  545. }
  546. // Check same base pointer. Be careful of subregisters, which can occur
  547. // with vectors of pointers.
  548. if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
  549. AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
  550. Match = false;
  551. break;
  552. }
  553. }
  554. if (Match) {
  555. int OffsetIdx =
  556. AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
  557. CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
  558. CI.Width0 = getOpcodeWidth(*CI.I);
  559. CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
  560. CI.Width1 = getOpcodeWidth(*MBBI);
  561. CI.Paired = MBBI;
  562. if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
  563. CI.Offset0 &= 0xffff;
  564. CI.Offset1 &= 0xffff;
  565. } else {
  566. CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
  567. CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
  568. if (CI.InstClass != S_BUFFER_LOAD_IMM) {
  569. CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
  570. CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
  571. }
  572. CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
  573. CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
  574. }
  575. // Check both offsets fit in the reduced range.
  576. // We also need to go through the list of instructions that we plan to
  577. // move and make sure they are all safe to move down past the merged
  578. // instruction.
  579. if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
  580. if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
  581. return true;
  582. }
  583. // We've found a load/store that we couldn't merge for some reason.
  584. // We could potentially keep looking, but we'd need to make sure that
  585. // it was safe to move I and also all the instructions in InstsToMove
  586. // down past this instruction.
  587. // Check if we can move I across MBBI and if we can move all of I's users.
  588. if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
  589. !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
  590. break;
  591. }
  592. return false;
  593. }
  594. unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  595. if (STM->ldsRequiresM0Init())
  596. return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  597. return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
  598. }
  599. unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  600. if (STM->ldsRequiresM0Init())
  601. return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  602. return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
  603. : AMDGPU::DS_READ2ST64_B64_gfx9;
  604. }
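// Replace the pair of DS reads in CI with a single ds_read2 (or ds_read2st64),
// materializing a new base register first if a base offset was chosen, and
// copy the two halves of the result into the original destination registers.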
  605. MachineBasicBlock::iterator
  606. SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  607. MachineBasicBlock *MBB = CI.I->getParent();
  608. // Be careful, since the addresses could be subregisters themselves in weird
  609. // cases, like vectors of pointers.
  610. const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  611. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  612. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
  613. unsigned NewOffset0 = CI.Offset0;
  614. unsigned NewOffset1 = CI.Offset1;
  615. unsigned Opc =
  616. CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
  617. unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  618. unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
  619. if (NewOffset0 > NewOffset1) {
  620. // Canonicalize the merged instruction so the smaller offset comes first.
  621. std::swap(NewOffset0, NewOffset1);
  622. std::swap(SubRegIdx0, SubRegIdx1);
  623. }
  624. assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
  625. (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  626. const MCInstrDesc &Read2Desc = TII->get(Opc);
  627. const TargetRegisterClass *SuperRC =
  628. (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  629. Register DestReg = MRI->createVirtualRegister(SuperRC);
  630. DebugLoc DL = CI.I->getDebugLoc();
  631. Register BaseReg = AddrReg->getReg();
  632. unsigned BaseSubReg = AddrReg->getSubReg();
  633. unsigned BaseRegFlags = 0;
  634. if (CI.BaseOff) {
  635. Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  636. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
  637. .addImm(CI.BaseOff);
  638. BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  639. BaseRegFlags = RegState::Kill;
  640. TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
  641. .addReg(ImmReg)
  642. .addReg(AddrReg->getReg(), 0, BaseSubReg)
  643. .addImm(0); // clamp bit
  644. BaseSubReg = 0;
  645. }
  646. MachineInstrBuilder Read2 =
  647. BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
  648. .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
  649. .addImm(NewOffset0) // offset0
  650. .addImm(NewOffset1) // offset1
  651. .addImm(0) // gds
  652. .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
  653. (void)Read2;
  654. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  655. // Copy to the old destination registers.
  656. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  657. .add(*Dest0) // Copy to same destination including flags and sub reg.
  658. .addReg(DestReg, 0, SubRegIdx0);
  659. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  660. .add(*Dest1)
  661. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  662. moveInstsAfter(Copy1, CI.InstsToMove);
  663. MachineBasicBlock::iterator Next = std::next(CI.I);
  664. CI.I->eraseFromParent();
  665. CI.Paired->eraseFromParent();
  666. LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  667. return Next;
  668. }
  669. unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  670. if (STM->ldsRequiresM0Init())
  671. return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  672. return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
  673. : AMDGPU::DS_WRITE2_B64_gfx9;
  674. }
  675. unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  676. if (STM->ldsRequiresM0Init())
  677. return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
  678. : AMDGPU::DS_WRITE2ST64_B64;
  679. return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
  680. : AMDGPU::DS_WRITE2ST64_B64_gfx9;
  681. }
  682. MachineBasicBlock::iterator
  683. SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  684. MachineBasicBlock *MBB = CI.I->getParent();
  685. // Be sure to use .add() and not .addReg() with these operands, so that the
  686. // subregister index and any register flags set on them are preserved.
  687. const MachineOperand *AddrReg =
  688. TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  689. const MachineOperand *Data0 =
  690. TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  691. const MachineOperand *Data1 =
  692. TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
  693. unsigned NewOffset0 = CI.Offset0;
  694. unsigned NewOffset1 = CI.Offset1;
  695. unsigned Opc =
  696. CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
  697. if (NewOffset0 > NewOffset1) {
  698. // Canonicalize the merged instruction so the smaller offset comes first.
  699. std::swap(NewOffset0, NewOffset1);
  700. std::swap(Data0, Data1);
  701. }
  702. assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
  703. (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  704. const MCInstrDesc &Write2Desc = TII->get(Opc);
  705. DebugLoc DL = CI.I->getDebugLoc();
  706. Register BaseReg = AddrReg->getReg();
  707. unsigned BaseSubReg = AddrReg->getSubReg();
  708. unsigned BaseRegFlags = 0;
  709. if (CI.BaseOff) {
  710. Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  711. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
  712. .addImm(CI.BaseOff);
  713. BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  714. BaseRegFlags = RegState::Kill;
  715. TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
  716. .addReg(ImmReg)
  717. .addReg(AddrReg->getReg(), 0, BaseSubReg)
  718. .addImm(0); // clamp bit
  719. BaseSubReg = 0;
  720. }
  721. MachineInstrBuilder Write2 =
  722. BuildMI(*MBB, CI.Paired, DL, Write2Desc)
  723. .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
  724. .add(*Data0) // data0
  725. .add(*Data1) // data1
  726. .addImm(NewOffset0) // offset0
  727. .addImm(NewOffset1) // offset1
  728. .addImm(0) // gds
  729. .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
  730. moveInstsAfter(Write2, CI.InstsToMove);
  731. MachineBasicBlock::iterator Next = std::next(CI.I);
  732. CI.I->eraseFromParent();
  733. CI.Paired->eraseFromParent();
  734. LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  735. return Next;
  736. }
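// Replace the pair of s_buffer_load_dword instructions in CI with a single
// wider load and copy the subregisters back to the original destinations.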
  737. MachineBasicBlock::iterator
  738. SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  739. MachineBasicBlock *MBB = CI.I->getParent();
  740. DebugLoc DL = CI.I->getDebugLoc();
  741. const unsigned Opcode = getNewOpcode(CI);
  742. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  743. Register DestReg = MRI->createVirtualRegister(SuperRC);
  744. unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
  745. // It shouldn't be possible to get this far if the two instructions
  746. // don't have a single memoperand, because MachineInstr::mayAlias()
  747. // will return true if this is the case.
  748. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  749. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  750. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  751. BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
  752. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
  753. .addImm(MergedOffset) // offset
  754. .addImm(CI.GLC0) // glc
  755. .addImm(CI.DLC0) // dlc
  756. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  757. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  758. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  759. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  760. // Copy to the old destination registers.
  761. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  762. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  763. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
  764. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  765. .add(*Dest0) // Copy to same destination including flags and sub reg.
  766. .addReg(DestReg, 0, SubRegIdx0);
  767. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  768. .add(*Dest1)
  769. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  770. moveInstsAfter(Copy1, CI.InstsToMove);
  771. MachineBasicBlock::iterator Next = std::next(CI.I);
  772. CI.I->eraseFromParent();
  773. CI.Paired->eraseFromParent();
  774. return Next;
  775. }
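// Replace the pair of MUBUF loads in CI with a single wider buffer load and
// copy the subregisters back to the original destination registers.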
  776. MachineBasicBlock::iterator
  777. SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  778. MachineBasicBlock *MBB = CI.I->getParent();
  779. DebugLoc DL = CI.I->getDebugLoc();
  780. const unsigned Opcode = getNewOpcode(CI);
  781. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  782. // Create the new destination register for the merged load.
  783. Register DestReg = MRI->createVirtualRegister(SuperRC);
  784. unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
  785. auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
  786. const unsigned Regs = getRegs(Opcode);
  787. if (Regs & VADDR)
  788. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  789. // It shouldn't be possible to get this far if the two instructions
  790. // don't have a single memoperand, because MachineInstr::mayAlias()
  791. // will return true if this is the case.
  792. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  793. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  794. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  795. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
  796. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
  797. .addImm(MergedOffset) // offset
  798. .addImm(CI.GLC0) // glc
  799. .addImm(CI.SLC0) // slc
  800. .addImm(0) // tfe
  801. .addImm(CI.DLC0) // dlc
  802. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  803. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  804. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  805. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  806. // Copy to the old destination registers.
  807. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  808. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  809. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
  810. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  811. .add(*Dest0) // Copy to same destination including flags and sub reg.
  812. .addReg(DestReg, 0, SubRegIdx0);
  813. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  814. .add(*Dest1)
  815. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  816. moveInstsAfter(Copy1, CI.InstsToMove);
  817. MachineBasicBlock::iterator Next = std::next(CI.I);
  818. CI.I->eraseFromParent();
  819. CI.Paired->eraseFromParent();
  820. return Next;
  821. }
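// Return the opcode of the merged instruction for the combined width.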
  822. unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  823. const unsigned Width = CI.Width0 + CI.Width1;
  824. switch (CI.InstClass) {
  825. default:
  826. // FIXME: Handle d16 correctly
  827. return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
  828. case UNKNOWN:
  829. llvm_unreachable("Unknown instruction class");
  830. case S_BUFFER_LOAD_IMM:
  831. switch (Width) {
  832. default:
  833. return 0;
  834. case 2:
  835. return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
  836. case 4:
  837. return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
  838. }
  839. }
  840. }
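// Return the subregister indices within the merged result that correspond to
// CI.I and CI.Paired, based on their widths and which offset is lower.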
  841. std::pair<unsigned, unsigned>
  842. SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  843. if (CI.Offset0 > CI.Offset1) {
  844. switch (CI.Width0) {
  845. default:
  846. return std::make_pair(0, 0);
  847. case 1:
  848. switch (CI.Width1) {
  849. default:
  850. return std::make_pair(0, 0);
  851. case 1:
  852. return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
  853. case 2:
  854. return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
  855. case 3:
  856. return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
  857. }
  858. case 2:
  859. switch (CI.Width1) {
  860. default:
  861. return std::make_pair(0, 0);
  862. case 1:
  863. return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
  864. case 2:
  865. return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
  866. }
  867. case 3:
  868. switch (CI.Width1) {
  869. default:
  870. return std::make_pair(0, 0);
  871. case 1:
  872. return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
  873. }
  874. }
  875. } else {
  876. switch (CI.Width0) {
  877. default:
  878. return std::make_pair(0, 0);
  879. case 1:
  880. switch (CI.Width1) {
  881. default:
  882. return std::make_pair(0, 0);
  883. case 1:
  884. return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
  885. case 2:
  886. return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
  887. case 3:
  888. return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
  889. }
  890. case 2:
  891. switch (CI.Width1) {
  892. default:
  893. return std::make_pair(0, 0);
  894. case 1:
  895. return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
  896. case 2:
  897. return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
  898. }
  899. case 3:
  900. switch (CI.Width1) {
  901. default:
  902. return std::make_pair(0, 0);
  903. case 1:
  904. return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
  905. }
  906. }
  907. }
  908. }
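// Choose a register class wide enough for the merged result: SGPR classes for
// S_BUFFER loads, VGPR classes otherwise.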
  909. const TargetRegisterClass *
  910. SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  911. if (CI.InstClass == S_BUFFER_LOAD_IMM) {
  912. switch (CI.Width0 + CI.Width1) {
  913. default:
  914. return nullptr;
  915. case 2:
  916. return &AMDGPU::SReg_64_XEXECRegClass;
  917. case 4:
  918. return &AMDGPU::SReg_128RegClass;
  919. case 8:
  920. return &AMDGPU::SReg_256RegClass;
  921. case 16:
  922. return &AMDGPU::SReg_512RegClass;
  923. }
  924. } else {
  925. switch (CI.Width0 + CI.Width1) {
  926. default:
  927. return nullptr;
  928. case 2:
  929. return &AMDGPU::VReg_64RegClass;
  930. case 3:
  931. return &AMDGPU::VReg_96RegClass;
  932. case 4:
  933. return &AMDGPU::VReg_128RegClass;
  934. }
  935. }
  936. }
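// Replace the pair of MUBUF stores in CI with a single wider store, building
// the combined data value with a REG_SEQUENCE.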
  937. MachineBasicBlock::iterator
  938. SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  939. MachineBasicBlock *MBB = CI.I->getParent();
  940. DebugLoc DL = CI.I->getDebugLoc();
  941. const unsigned Opcode = getNewOpcode(CI);
  942. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  943. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  944. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  945. // Copy to the new source register.
  946. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  947. Register SrcReg = MRI->createVirtualRegister(SuperRC);
  948. const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  949. const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
  950. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
  951. .add(*Src0)
  952. .addImm(SubRegIdx0)
  953. .add(*Src1)
  954. .addImm(SubRegIdx1);
  955. auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
  956. .addReg(SrcReg, RegState::Kill);
  957. const unsigned Regs = getRegs(Opcode);
  958. if (Regs & VADDR)
  959. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  960. // It shouldn't be possible to get this far if the two instructions
  961. // don't have a single memoperand, because MachineInstr::mayAlias()
  962. // will return true if this is the case.
  963. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  964. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  965. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  966. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
  967. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
  968. .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
  969. .addImm(CI.GLC0) // glc
  970. .addImm(CI.SLC0) // slc
  971. .addImm(0) // tfe
  972. .addImm(CI.DLC0) // dlc
  973. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  974. moveInstsAfter(MIB, CI.InstsToMove);
  975. MachineBasicBlock::iterator Next = std::next(CI.I);
  976. CI.I->eraseFromParent();
  977. CI.Paired->eraseFromParent();
  978. return Next;
  979. }
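// Return Val as an immediate operand if it is a legal inline constant;
// otherwise materialize it into an SGPR with S_MOV_B32.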
  980. MachineOperand
  981. SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  982. APInt V(32, Val, true);
  983. if (TII->isInlineConstant(V))
  984. return MachineOperand::CreateImm(Val);
  985. Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  986. MachineInstr *Mov =
  987. BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
  988. TII->get(AMDGPU::S_MOV_B32), Reg)
  989. .addImm(Val);
  990. (void)Mov;
  991. LLVM_DEBUG(dbgs() << " "; Mov->dump());
  992. return MachineOperand::CreateReg(Reg, false);
  993. }
  994. // Compute base address using Addr and return the final register.
  995. unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
  996. const MemAddress &Addr) const {
  997. MachineBasicBlock *MBB = MI.getParent();
  998. MachineBasicBlock::iterator MBBI = MI.getIterator();
  999. DebugLoc DL = MI.getDebugLoc();
  1000. assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
  1001. Addr.Base.LoSubReg) &&
  1002. "Expected 32-bit Base-Register-Low!!");
  1003. assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
  1004. Addr.Base.HiSubReg) &&
  1005. "Expected 32-bit Base-Register-Hi!!");
  1006. LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  1007. MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  1008. MachineOperand OffsetHi =
  1009. createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  1010. const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  1011. Register CarryReg = MRI->createVirtualRegister(CarryRC);
  1012. Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
  1013. Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  1014. Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  1015. MachineInstr *LoHalf =
  1016. BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
  1017. .addReg(CarryReg, RegState::Define)
  1018. .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
  1019. .add(OffsetLo)
  1020. .addImm(0); // clamp bit
  1021. (void)LoHalf;
  1022. LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
  1023. MachineInstr *HiHalf =
  1024. BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
  1025. .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
  1026. .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
  1027. .add(OffsetHi)
  1028. .addReg(CarryReg, RegState::Kill)
  1029. .addImm(0); // clamp bit
  1030. (void)HiHalf;
  1031. LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
  1032. Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  1033. MachineInstr *FullBase =
  1034. BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
  1035. .addReg(DestSub0)
  1036. .addImm(AMDGPU::sub0)
  1037. .addReg(DestSub1)
  1038. .addImm(AMDGPU::sub1);
  1039. (void)FullBase;
  1040. LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
  1041. return FullDestReg;
  1042. }
  1043. // Update base and offset with the NewBase and NewOffset in MI.
  1044. void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
  1045. unsigned NewBase,
  1046. int32_t NewOffset) const {
  1047. TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  1048. TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
  1049. }
  1050. Optional<int32_t>
  1051. SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  1052. if (Op.isImm())
  1053. return Op.getImm();
  1054. if (!Op.isReg())
  1055. return None;
  1056. MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  1057. if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
  1058. !Def->getOperand(1).isImm())
  1059. return None;
  1060. return Def->getOperand(1).getImm();
  1061. }
  1062. // Analyze Base and extract:
  1063. // - the 32-bit base registers and their subregisters
  1064. // - the 64-bit constant offset
  1065. // Expecting base computation as:
  1066. // %OFFSET0:sgpr_32 = S_MOV_B32 8000
  1067. // %LO:vgpr_32, %c:sreg_64_xexec =
  1068. // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
  1069. // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
  1070. // %Base:vreg_64 =
  1071. // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
  1072. void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
  1073. MemAddress &Addr) const {
  1074. if (!Base.isReg())
  1075. return;
  1076. MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  1077. if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
  1078. || Def->getNumOperands() != 5)
  1079. return;
  1080. MachineOperand BaseLo = Def->getOperand(1);
  1081. MachineOperand BaseHi = Def->getOperand(3);
  1082. if (!BaseLo.isReg() || !BaseHi.isReg())
  1083. return;
  1084. MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  1085. MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
  1086. if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
  1087. !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
  1088. return;
  1089. const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  1090. const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
  1091. auto Offset0P = extractConstOffset(*Src0);
  1092. if (Offset0P)
  1093. BaseLo = *Src1;
  1094. else {
  1095. if (!(Offset0P = extractConstOffset(*Src1)))
  1096. return;
  1097. BaseLo = *Src0;
  1098. }
  1099. Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  1100. Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
  1101. if (Src0->isImm())
  1102. std::swap(Src0, Src1);
  1103. if (!Src1->isImm())
  1104. return;
  1105. uint64_t Offset1 = Src1->getImm();
  1106. BaseHi = *Src0;
  1107. Addr.Base.LoReg = BaseLo.getReg();
  1108. Addr.Base.HiReg = BaseHi.getReg();
  1109. Addr.Base.LoSubReg = BaseLo.getSubReg();
  1110. Addr.Base.HiSubReg = BaseHi.getSubReg();
  1111. Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
  1112. }
  1113. bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
  1114. MachineInstr &MI,
  1115. MemInfoMap &Visited,
  1116. SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  1117. if (!(MI.mayLoad() ^ MI.mayStore()))
  1118. return false;
  1119. // TODO: Support flat and scratch.
  1120. if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
  1121. return false;
  1122. if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
  1123. return false;
  1124. if (AnchorList.count(&MI))
  1125. return false;
  1126. LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
  1127. if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
  1128. LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
  1129. return false;
  1130. }
  1131. // Step1: Find the base-registers and a 64bit constant offset.
  1132. MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  1133. MemAddress MAddr;
  1134. if (Visited.find(&MI) == Visited.end()) {
  1135. processBaseWithConstOffset(Base, MAddr);
  1136. Visited[&MI] = MAddr;
  1137. } else
  1138. MAddr = Visited[&MI];
  1139. if (MAddr.Offset == 0) {
  1140. LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
  1141. " constant offsets that can be promoted.\n";);
  1142. return false;
  1143. }
  1144. LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
  1145. << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
  1146. // Step2: Traverse through MI's basic block and find an anchor (one with the
  1147. // same base registers) at the largest 13-bit-encodable distance from MI's offset.
  1148. // E.g. (64bit loads)
  1149. // bb:
  1150. // addr1 = &a + 4096; load1 = load(addr1, 0)
  1151. // addr2 = &a + 6144; load2 = load(addr2, 0)
  1152. // addr3 = &a + 8192; load3 = load(addr3, 0)
  1153. // addr4 = &a + 10240; load4 = load(addr4, 0)
  1154. // addr5 = &a + 12288; load5 = load(addr5, 0)
  1155. //
  1156. // Starting from the first load, the optimization will try to find a new base
  1157. // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  1158. // have a 13-bit distance from &a + 4096. The heuristic considers &a + 8192
  1159. // as the new base (anchor) because it is at the maximum distance, which can
  1160. // presumably accommodate more intermediate bases.
  1161. //
  1162. // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  1163. // (&a + 8192) for load1, load2, load4.
  1164. // addr = &a + 8192
  1165. // load1 = load(addr, -4096)
  1166. // load2 = load(addr, -2048)
  1167. // load3 = load(addr, 0)
  1168. // load4 = load(addr, 2048)
  1169. // addr5 = &a + 12288; load5 = load(addr5, 0)
  1170. //
  1171. MachineInstr *AnchorInst = nullptr;
  1172. MemAddress AnchorAddr;
  1173. uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  1174. SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
  1175. MachineBasicBlock *MBB = MI.getParent();
  1176. MachineBasicBlock::iterator E = MBB->end();
  1177. MachineBasicBlock::iterator MBBI = MI.getIterator();
  1178. ++MBBI;
  1179. const SITargetLowering *TLI =
  1180. static_cast<const SITargetLowering *>(STM->getTargetLowering());
  1181. for ( ; MBBI != E; ++MBBI) {
  1182. MachineInstr &MINext = *MBBI;
  1183. // TODO: Support finding an anchor(with same base) from store addresses or
  1184. // any other load addresses where the opcodes are different.
  1185. if (MINext.getOpcode() != MI.getOpcode() ||
  1186. TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
  1187. continue;
  1188. const MachineOperand &BaseNext =
  1189. *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
  1190. MemAddress MAddrNext;
  1191. if (Visited.find(&MINext) == Visited.end()) {
  1192. processBaseWithConstOffset(BaseNext, MAddrNext);
  1193. Visited[&MINext] = MAddrNext;
  1194. } else
  1195. MAddrNext = Visited[&MINext];
  1196. if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
  1197. MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
  1198. MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
  1199. MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
  1200. continue;
  1201. InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));
  1202. int64_t Dist = MAddr.Offset - MAddrNext.Offset;
  1203. TargetLoweringBase::AddrMode AM;
  1204. AM.HasBaseReg = true;
  1205. AM.BaseOffs = Dist;
  1206. if (TLI->isLegalGlobalAddressingMode(AM) &&
  1207. (uint32_t)std::abs(Dist) > MaxDist) {
  1208. MaxDist = std::abs(Dist);
  1209. AnchorAddr = MAddrNext;
  1210. AnchorInst = &MINext;
  1211. }
  1212. }
  1213. if (AnchorInst) {
  1214. LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
  1215. AnchorInst->dump());
  1216. LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
  1217. << AnchorAddr.Offset << "\n\n");
  1218. // Instead of moving up, just re-compute anchor-instruction's base address.
  1219. unsigned Base = computeBase(MI, AnchorAddr);
  1220. updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
  1221. LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
  1222. for (auto P : InstsWCommonBase) {
  1223. TargetLoweringBase::AddrMode AM;
  1224. AM.HasBaseReg = true;
  1225. AM.BaseOffs = P.second - AnchorAddr.Offset;
  1226. if (TLI->isLegalGlobalAddressingMode(AM)) {
  1227. LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
  1228. dbgs() << ")"; P.first->dump());
  1229. updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
  1230. LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
  1231. }
  1232. }
  1233. AnchorList.insert(AnchorInst);
  1234. return true;
  1235. }
  1236. return false;
  1237. }
  1238. // Scan through looking for adjacent LDS operations with constant offsets from
  1239. // the same base register. We rely on the scheduler to do the hard work of
  1240. // clustering nearby loads, and assume these are all adjacent.
  1241. bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
  1242. bool Modified = false;
  1243. // Maps each visited instruction to its decomposed base registers and constant offset.
  1244. MemInfoMap Visited;
  1245. // Contains the list of instructions for which constant offsets are being
  1246. // promoted to the IMM.
  1247. SmallPtrSet<MachineInstr *, 4> AnchorList;
  1248. for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
  1249. MachineInstr &MI = *I;
  1250. if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
  1251. Modified = true;
  1252. // Don't combine if volatile.
  1253. if (MI.hasOrderedMemoryRef()) {
  1254. ++I;
  1255. continue;
  1256. }
  1257. const unsigned Opc = MI.getOpcode();
  1258. CombineInfo CI;
  1259. CI.I = I;
  1260. CI.InstClass = getInstClass(Opc);
  1261. switch (CI.InstClass) {
  1262. default:
  1263. break;
  1264. case DS_READ:
  1265. CI.EltSize =
  1266. (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
  1267. : 4;
  1268. if (findMatchingInst(CI)) {
  1269. Modified = true;
  1270. I = mergeRead2Pair(CI);
  1271. } else {
  1272. ++I;
  1273. }
  1274. continue;
  1275. case DS_WRITE:
  1276. CI.EltSize =
  1277. (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  1278. : 4;
  1279. if (findMatchingInst(CI)) {
  1280. Modified = true;
  1281. I = mergeWrite2Pair(CI);
  1282. } else {
  1283. ++I;
  1284. }
  1285. continue;
  1286. case S_BUFFER_LOAD_IMM:
  1287. CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
  1288. if (findMatchingInst(CI)) {
  1289. Modified = true;
  1290. I = mergeSBufferLoadImmPair(CI);
  1291. OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
  1292. } else {
  1293. ++I;
  1294. }
  1295. continue;
  1296. case BUFFER_LOAD_OFFEN:
  1297. case BUFFER_LOAD_OFFSET:
  1298. case BUFFER_LOAD_OFFEN_exact:
  1299. case BUFFER_LOAD_OFFSET_exact:
  1300. CI.EltSize = 4;
  1301. if (findMatchingInst(CI)) {
  1302. Modified = true;
  1303. I = mergeBufferLoadPair(CI);
  1304. OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
  1305. } else {
  1306. ++I;
  1307. }
  1308. continue;
  1309. case BUFFER_STORE_OFFEN:
  1310. case BUFFER_STORE_OFFSET:
  1311. case BUFFER_STORE_OFFEN_exact:
  1312. case BUFFER_STORE_OFFSET_exact:
  1313. CI.EltSize = 4;
  1314. if (findMatchingInst(CI)) {
  1315. Modified = true;
  1316. I = mergeBufferStorePair(CI);
  1317. OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
  1318. } else {
  1319. ++I;
  1320. }
  1321. continue;
  1322. }
  1323. ++I;
  1324. }
  1325. return Modified;
  1326. }
  1327. bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  1328. if (skipFunction(MF.getFunction()))
  1329. return false;
  1330. STM = &MF.getSubtarget<GCNSubtarget>();
  1331. if (!STM->loadStoreOptEnabled())
  1332. return false;
  1333. TII = STM->getInstrInfo();
  1334. TRI = &TII->getRegisterInfo();
  1335. MRI = &MF.getRegInfo();
  1336. AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
  1337. assert(MRI->isSSA() && "Must be run on SSA");
  1338. LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
  1339. bool Modified = false;
  1340. for (MachineBasicBlock &MBB : MF) {
  1341. do {
  1342. OptimizeAgain = false;
  1343. Modified |= optimizeBlock(MBB);
  1344. } while (OptimizeAgain);
  1345. }
  1346. return Modified;
  1347. }