SILoadStoreOptimizer.cpp
  1. //===- SILoadStoreOptimizer.cpp -------------------------------------------===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This pass tries to fuse DS instructions with close by immediate offsets.
  10. // This will fuse operations such as
  11. // ds_read_b32 v0, v2 offset:16
  12. // ds_read_b32 v1, v2 offset:32
  13. // ==>
  14. // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
  15. //
  16. // The same is done for certain SMEM and VMEM opcodes, e.g.:
  17. // s_buffer_load_dword s4, s[0:3], 4
  18. // s_buffer_load_dword s5, s[0:3], 8
  19. // ==>
  20. // s_buffer_load_dwordx2 s[4:5], s[0:3], 4
  21. //
  22. // This pass also tries to promote constant offset to the immediate by
  23. // adjusting the base. It tries to use a base from the nearby instructions that
  24. // allows it to have a 13bit constant offset and then promotes the 13bit offset
  25. // to the immediate.
  26. // E.g.
  27. // s_movk_i32 s0, 0x1800
  28. // v_add_co_u32_e32 v0, vcc, s0, v2
  29. // v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
  30. //
  31. // s_movk_i32 s0, 0x1000
  32. // v_add_co_u32_e32 v5, vcc, s0, v2
  33. // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  34. // global_load_dwordx2 v[5:6], v[5:6], off
  35. // global_load_dwordx2 v[0:1], v[0:1], off
  36. // =>
  37. // s_movk_i32 s0, 0x1000
  38. // v_add_co_u32_e32 v5, vcc, s0, v2
  39. // v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
  40. // global_load_dwordx2 v[5:6], v[5:6], off
  41. // global_load_dwordx2 v[0:1], v[5:6], off offset:2048
  42. //
  43. // Future improvements:
  44. //
  45. // - This is currently missing stores of constants because loading
  46. // the constant into the data register is placed between the stores, although
  47. // this is arguably a scheduling problem.
  48. //
  49. // - Live interval recomputing seems inefficient. This currently only matches
  50. // one pair, and recomputes live intervals and moves on to the next pair. It
  51. // would be better to compute a list of all merges that need to occur.
  52. //
  53. // - With a list of instructions to process, we can also merge more. If a
  54. // cluster of loads has offsets that are too large to fit in the 8-bit
  55. // offset fields, but are close enough to each other to fit once rebased, we
  56. // can add to the base pointer and use the new, reduced offsets.
  57. //
  58. //===----------------------------------------------------------------------===//
  59. #include "AMDGPU.h"
  60. #include "AMDGPUSubtarget.h"
  61. #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  62. #include "SIInstrInfo.h"
  63. #include "SIRegisterInfo.h"
  64. #include "Utils/AMDGPUBaseInfo.h"
  65. #include "llvm/ADT/ArrayRef.h"
  66. #include "llvm/ADT/SmallVector.h"
  67. #include "llvm/ADT/StringRef.h"
  68. #include "llvm/Analysis/AliasAnalysis.h"
  69. #include "llvm/CodeGen/MachineBasicBlock.h"
  70. #include "llvm/CodeGen/MachineFunction.h"
  71. #include "llvm/CodeGen/MachineFunctionPass.h"
  72. #include "llvm/CodeGen/MachineInstr.h"
  73. #include "llvm/CodeGen/MachineInstrBuilder.h"
  74. #include "llvm/CodeGen/MachineOperand.h"
  75. #include "llvm/CodeGen/MachineRegisterInfo.h"
  76. #include "llvm/IR/DebugLoc.h"
  77. #include "llvm/Pass.h"
  78. #include "llvm/Support/Debug.h"
  79. #include "llvm/Support/MathExtras.h"
  80. #include "llvm/Support/raw_ostream.h"
  81. #include <algorithm>
  82. #include <cassert>
  83. #include <cstdlib>
  84. #include <iterator>
  85. #include <utility>
  86. using namespace llvm;
  87. #define DEBUG_TYPE "si-load-store-opt"
  88. namespace {
  89. enum InstClassEnum {
  90. UNKNOWN,
  91. DS_READ,
  92. DS_WRITE,
  93. S_BUFFER_LOAD_IMM,
  94. BUFFER_LOAD,
  95. BUFFER_STORE,
  96. };
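// Bit flags describing which address operands an instruction uses; see
// getRegs() below. For example, a MUBUF "offen" load uses
// VADDR | SRSRC | SOFFSET, a DS instruction uses only ADDR, and an
// S_BUFFER_LOAD_*_IMM uses SBASE.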
  97. enum RegisterEnum {
  98. SBASE = 0x1,
  99. SRSRC = 0x2,
  100. SOFFSET = 0x4,
  101. VADDR = 0x8,
  102. ADDR = 0x10,
  103. };
  104. class SILoadStoreOptimizer : public MachineFunctionPass {
  105. struct CombineInfo {
  106. MachineBasicBlock::iterator I;
  107. MachineBasicBlock::iterator Paired;
  108. unsigned EltSize;
  109. unsigned Offset0;
  110. unsigned Offset1;
  111. unsigned Width0;
  112. unsigned Width1;
  113. unsigned BaseOff;
  114. InstClassEnum InstClass;
  115. bool GLC0;
  116. bool GLC1;
  117. bool SLC0;
  118. bool SLC1;
  119. bool DLC0;
  120. bool DLC1;
  121. bool UseST64;
  122. SmallVector<MachineInstr *, 8> InstsToMove;
  123. int AddrIdx[5];
  124. const MachineOperand *AddrReg[5];
  125. unsigned NumAddresses;
  126. bool hasSameBaseAddress(const MachineInstr &MI) {
  127. for (unsigned i = 0; i < NumAddresses; i++) {
  128. const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
  129. if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
  130. if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
  131. AddrReg[i]->getImm() != AddrRegNext.getImm()) {
  132. return false;
  133. }
  134. continue;
  135. }
  136. // Check same base pointer. Be careful of subregisters, which can occur
  137. // with vectors of pointers.
  138. if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
  139. AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
  140. return false;
  141. }
  142. }
  143. return true;
  144. }
  145. bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
  146. for (unsigned i = 0; i < NumAddresses; ++i) {
  147. const MachineOperand *AddrOp = AddrReg[i];
  148. // Immediates are always OK.
  149. if (AddrOp->isImm())
  150. continue;
  151. // Don't try to merge addresses that aren't either immediates or registers.
  152. // TODO: Should be possible to merge FrameIndexes and maybe some other
  153. // non-register operands.
  154. if (!AddrOp->isReg())
  155. return false;
  156. // TODO: We should be able to merge physical reg addresses.
  157. if (Register::isPhysicalRegister(AddrOp->getReg()))
  158. return false;
  159. // If an address has only one use then there will be no other
  160. // instructions with the same address, so we can't merge this one.
  161. if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
  162. return false;
  163. }
  164. return true;
  165. }
  166. void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
  167. const GCNSubtarget &STM);
  168. void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
  169. };
  170. struct BaseRegisters {
  171. unsigned LoReg = 0;
  172. unsigned HiReg = 0;
  173. unsigned LoSubReg = 0;
  174. unsigned HiSubReg = 0;
  175. };
  176. struct MemAddress {
  177. BaseRegisters Base;
  178. int64_t Offset = 0;
  179. };
  180. using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
  181. private:
  182. const GCNSubtarget *STM = nullptr;
  183. const SIInstrInfo *TII = nullptr;
  184. const SIRegisterInfo *TRI = nullptr;
  185. MachineRegisterInfo *MRI = nullptr;
  186. AliasAnalysis *AA = nullptr;
  187. bool OptimizeAgain;
  188. static bool offsetsCanBeCombined(CombineInfo &CI);
  189. static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
  190. static unsigned getNewOpcode(const CombineInfo &CI);
  191. static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
  192. const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
  193. bool findMatchingInst(CombineInfo &CI);
  194. unsigned read2Opcode(unsigned EltSize) const;
  195. unsigned read2ST64Opcode(unsigned EltSize) const;
  196. MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI);
  197. unsigned write2Opcode(unsigned EltSize) const;
  198. unsigned write2ST64Opcode(unsigned EltSize) const;
  199. MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
  200. MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
  201. MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
  202. MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
  203. void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
  204. int32_t NewOffset) const;
  205. unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
  206. MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
  207. Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
  208. void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
  209. /// Promotes constant offset to the immediate by adjusting the base. It
  210. /// tries to use a base from the nearby instructions that allows it to have
  211. /// a 13bit constant offset which gets promoted to the immediate.
  212. bool promoteConstantOffsetToImm(MachineInstr &CI,
  213. MemInfoMap &Visited,
  214. SmallPtrSet<MachineInstr *, 4> &Promoted) const;
  215. void addInstToMergeableList(const CombineInfo &CI,
  216. std::list<std::list<CombineInfo> > &MergeableInsts) const;
  217. bool collectMergeableInsts(MachineBasicBlock &MBB,
  218. std::list<std::list<CombineInfo> > &MergeableInsts) const;
  219. public:
  220. static char ID;
  221. SILoadStoreOptimizer() : MachineFunctionPass(ID) {
  222. initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
  223. }
  224. void removeCombinedInst(std::list<CombineInfo> &MergeList,
  225. const MachineInstr &MI);
  226. bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
  227. bool &OptimizeListAgain);
  228. bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
  229. bool runOnMachineFunction(MachineFunction &MF) override;
  230. StringRef getPassName() const override { return "SI Load Store Optimizer"; }
  231. void getAnalysisUsage(AnalysisUsage &AU) const override {
  232. AU.setPreservesCFG();
  233. AU.addRequired<AAResultsWrapperPass>();
  234. MachineFunctionPass::getAnalysisUsage(AU);
  235. }
  236. };
  237. static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
  238. const unsigned Opc = MI.getOpcode();
  239. if (TII.isMUBUF(Opc)) {
  240. // FIXME: Handle d16 correctly
  241. return AMDGPU::getMUBUFElements(Opc);
  242. }
  243. switch (Opc) {
  244. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  245. return 1;
  246. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  247. return 2;
  248. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  249. return 4;
  250. default:
  251. return 0;
  252. }
  253. }
  254. /// Maps instruction opcode to enum InstClassEnum.
  255. static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  256. switch (Opc) {
  257. default:
  258. if (TII.isMUBUF(Opc)) {
  259. switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
  260. default:
  261. return UNKNOWN;
  262. case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
  263. case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
  264. case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
  265. case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
  266. return BUFFER_LOAD;
  267. case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
  268. case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
  269. case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
  270. case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
  271. return BUFFER_STORE;
  272. }
  273. }
  274. return UNKNOWN;
  275. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  276. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  277. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  278. return S_BUFFER_LOAD_IMM;
  279. case AMDGPU::DS_READ_B32:
  280. case AMDGPU::DS_READ_B32_gfx9:
  281. case AMDGPU::DS_READ_B64:
  282. case AMDGPU::DS_READ_B64_gfx9:
  283. return DS_READ;
  284. case AMDGPU::DS_WRITE_B32:
  285. case AMDGPU::DS_WRITE_B32_gfx9:
  286. case AMDGPU::DS_WRITE_B64:
  287. case AMDGPU::DS_WRITE_B64_gfx9:
  288. return DS_WRITE;
  289. }
  290. }
  291. /// Determines instruction subclass from opcode. Only instructions
  292. /// of the same subclass can be merged together.
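/// For example, BUFFER_LOAD_DWORD_OFFEN and BUFFER_LOAD_DWORDX2_OFFEN map to
/// the same MUBUF base opcode and so may be paired, whereas the OFFEN,
/// OFFSET and "_exact" forms map to different base opcodes and are never
/// merged with each other.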
  293. static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  294. switch (Opc) {
  295. default:
  296. if (TII.isMUBUF(Opc))
  297. return AMDGPU::getMUBUFBaseOpcode(Opc);
  298. return -1;
  299. case AMDGPU::DS_READ_B32:
  300. case AMDGPU::DS_READ_B32_gfx9:
  301. case AMDGPU::DS_READ_B64:
  302. case AMDGPU::DS_READ_B64_gfx9:
  303. case AMDGPU::DS_WRITE_B32:
  304. case AMDGPU::DS_WRITE_B32_gfx9:
  305. case AMDGPU::DS_WRITE_B64:
  306. case AMDGPU::DS_WRITE_B64_gfx9:
  307. return Opc;
  308. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  309. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  310. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  311. return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
  312. }
  313. }
  314. static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
  315. if (TII.isMUBUF(Opc)) {
  316. unsigned result = 0;
  317. if (AMDGPU::getMUBUFHasVAddr(Opc)) {
  318. result |= VADDR;
  319. }
  320. if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
  321. result |= SRSRC;
  322. }
  323. if (AMDGPU::getMUBUFHasSoffset(Opc)) {
  324. result |= SOFFSET;
  325. }
  326. return result;
  327. }
  328. switch (Opc) {
  329. default:
  330. return 0;
  331. case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
  332. case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
  333. case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
  334. return SBASE;
  335. case AMDGPU::DS_READ_B32:
  336. case AMDGPU::DS_READ_B64:
  337. case AMDGPU::DS_READ_B32_gfx9:
  338. case AMDGPU::DS_READ_B64_gfx9:
  339. case AMDGPU::DS_WRITE_B32:
  340. case AMDGPU::DS_WRITE_B64:
  341. case AMDGPU::DS_WRITE_B32_gfx9:
  342. case AMDGPU::DS_WRITE_B64_gfx9:
  343. return ADDR;
  344. }
  345. }
  346. void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
  347. const SIInstrInfo &TII,
  348. const GCNSubtarget &STM) {
  349. I = MI;
  350. unsigned Opc = MI->getOpcode();
  351. InstClass = getInstClass(Opc, TII);
  352. if (InstClass == UNKNOWN)
  353. return;
  354. switch (InstClass) {
  355. case DS_READ:
  356. EltSize =
  357. (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
  358. : 4;
  359. break;
  360. case DS_WRITE:
  361. EltSize =
  362. (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  363. : 4;
  364. break;
  365. case S_BUFFER_LOAD_IMM:
  366. EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
  367. break;
  368. default:
  369. EltSize = 4;
  370. break;
  371. }
  372. int OffsetIdx =
  373. AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
  374. Offset0 = I->getOperand(OffsetIdx).getImm();
  375. Width0 = getOpcodeWidth(*I, TII);
  376. if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  377. Offset0 &= 0xffff;
  378. } else {
  379. GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
  380. if (InstClass != S_BUFFER_LOAD_IMM) {
  381. SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
  382. }
  383. DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
  384. }
  385. unsigned AddrOpName[5] = {0};
  386. NumAddresses = 0;
  387. const unsigned Regs = getRegs(I->getOpcode(), TII);
  388. if (Regs & ADDR) {
  389. AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
  390. }
  391. if (Regs & SBASE) {
  392. AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
  393. }
  394. if (Regs & SRSRC) {
  395. AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
  396. }
  397. if (Regs & SOFFSET) {
  398. AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
  399. }
  400. if (Regs & VADDR) {
  401. AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
  402. }
  403. for (unsigned i = 0; i < NumAddresses; i++) {
  404. AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
  405. AddrReg[i] = &I->getOperand(AddrIdx[i]);
  406. }
  407. InstsToMove.clear();
  408. }
  409. void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
  410. const SIInstrInfo &TII) {
  411. Paired = MI;
  412. assert(InstClass == getInstClass(Paired->getOpcode(), TII));
  413. int OffsetIdx =
  414. AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
  415. Offset1 = Paired->getOperand(OffsetIdx).getImm();
  416. Width1 = getOpcodeWidth(*Paired, TII);
  417. if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  418. Offset1 &= 0xffff;
  419. } else {
  420. GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
  421. if (InstClass != S_BUFFER_LOAD_IMM) {
  422. SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
  423. }
  424. DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
  425. }
  426. }
  427. } // end anonymous namespace.
  428. INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
  429. "SI Load Store Optimizer", false, false)
  430. INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
  431. INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE, "SI Load Store Optimizer",
  432. false, false)
  433. char SILoadStoreOptimizer::ID = 0;
  434. char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
  435. FunctionPass *llvm::createSILoadStoreOptimizerPass() {
  436. return new SILoadStoreOptimizer();
  437. }
  438. static void moveInstsAfter(MachineBasicBlock::iterator I,
  439. ArrayRef<MachineInstr *> InstsToMove) {
  440. MachineBasicBlock *MBB = I->getParent();
  441. ++I;
  442. for (MachineInstr *MI : InstsToMove) {
  443. MI->removeFromParent();
  444. MBB->insert(I, MI);
  445. }
  446. }
  447. static void addDefsUsesToList(const MachineInstr &MI,
  448. DenseSet<unsigned> &RegDefs,
  449. DenseSet<unsigned> &PhysRegUses) {
  450. for (const MachineOperand &Op : MI.operands()) {
  451. if (Op.isReg()) {
  452. if (Op.isDef())
  453. RegDefs.insert(Op.getReg());
  454. else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
  455. PhysRegUses.insert(Op.getReg());
  456. }
  457. }
  458. }
  459. static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
  460. MachineBasicBlock::iterator B,
  461. AliasAnalysis *AA) {
  462. // RAW or WAR - cannot reorder
  463. // WAW - cannot reorder
  464. // RAR - safe to reorder
  465. return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
  466. }
  467. // Add MI and its defs to the lists if MI reads one of the defs that are
  468. // already in the list. Returns true in that case.
  469. static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
  470. DenseSet<unsigned> &PhysRegUses,
  471. SmallVectorImpl<MachineInstr *> &Insts) {
  472. for (MachineOperand &Use : MI.operands()) {
  473. // If one of the defs is read, then there is a use of Def between I and the
  474. // instruction that I will potentially be merged with. We will need to move
  475. // this instruction after the merged instructions.
  476. //
  477. // Similarly, if there is a def which is read by an instruction that is to
  478. // be moved for merging, then we need to move the def-instruction as well.
  479. // This can only happen for physical registers such as M0; virtual
  480. // registers are in SSA form.
  481. if (Use.isReg() &&
  482. ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
  483. (Use.isDef() && RegDefs.count(Use.getReg())) ||
  484. (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
  485. PhysRegUses.count(Use.getReg())))) {
  486. Insts.push_back(&MI);
  487. addDefsUsesToList(MI, RegDefs, PhysRegUses);
  488. return true;
  489. }
  490. }
  491. return false;
  492. }
  493. static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
  494. ArrayRef<MachineInstr *> InstsToMove,
  495. AliasAnalysis *AA) {
  496. assert(MemOp.mayLoadOrStore());
  497. for (MachineInstr *InstToMove : InstsToMove) {
  498. if (!InstToMove->mayLoadOrStore())
  499. continue;
  500. if (!memAccessesCanBeReordered(MemOp, *InstToMove, AA))
  501. return false;
  502. }
  503. return true;
  504. }
  505. // This function assumes that \p A and \p B are identical except for
  506. // size and offset, and that they reference adjacent memory.
  507. static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
  508. const MachineMemOperand *A,
  509. const MachineMemOperand *B) {
  510. unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
  511. unsigned Size = A->getSize() + B->getSize();
  512. // This function adds the offset parameter to the existing offset for A,
  513. // so we pass 0 here as the offset and then manually set it to the correct
  514. // value after the call.
  515. MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
  516. MMO->setOffset(MinOffset);
  517. return MMO;
  518. }
  519. bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
  520. // XXX - Would the same offset be OK? Is there any reason this would happen or
  521. // be useful?
  522. if (CI.Offset0 == CI.Offset1)
  523. return false;
  524. // This won't be valid if the offset isn't aligned.
  525. if ((CI.Offset0 % CI.EltSize != 0) || (CI.Offset1 % CI.EltSize != 0))
  526. return false;
  527. unsigned EltOffset0 = CI.Offset0 / CI.EltSize;
  528. unsigned EltOffset1 = CI.Offset1 / CI.EltSize;
  529. CI.UseST64 = false;
  530. CI.BaseOff = 0;
  531. // Handle SMEM and VMEM instructions.
  532. if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
  533. return (EltOffset0 + CI.Width0 == EltOffset1 ||
  534. EltOffset1 + CI.Width1 == EltOffset0) &&
  535. CI.GLC0 == CI.GLC1 && CI.DLC0 == CI.DLC1 &&
  536. (CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC0 == CI.SLC1);
  537. }
  538. // If the offset in elements doesn't fit in 8-bits, we might be able to use
  539. // the stride 64 versions.
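// For example, two ds_read_b32 (EltSize = 4) at byte offsets 0x4000 and
// 0x4100 have element offsets 4096 and 4160; neither fits in 8 bits, but
// both are multiples of 64, so they can be encoded as offset0:64 and
// offset1:65 of a ds_read2st64_b32.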
  540. if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
  541. isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
  542. CI.Offset0 = EltOffset0 / 64;
  543. CI.Offset1 = EltOffset1 / 64;
  544. CI.UseST64 = true;
  545. return true;
  546. }
  547. // Check if the new offsets fit in the reduced 8-bit range.
  548. if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
  549. CI.Offset0 = EltOffset0;
  550. CI.Offset1 = EltOffset1;
  551. return true;
  552. }
  553. // Try to shift base address to decrease offsets.
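// For example, byte offsets 0x4000 and 0x4040 (EltSize = 4) differ by only
// 16 elements; with BaseOff = 0x4000 folded into the address register by the
// merge routines below, the pair can use offset0:0 and offset1:16.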
  554. unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
  555. CI.BaseOff = std::min(CI.Offset0, CI.Offset1);
  556. if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
  557. CI.Offset0 = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
  558. CI.Offset1 = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
  559. CI.UseST64 = true;
  560. return true;
  561. }
  562. if (isUInt<8>(OffsetDiff)) {
  563. CI.Offset0 = EltOffset0 - CI.BaseOff / CI.EltSize;
  564. CI.Offset1 = EltOffset1 - CI.BaseOff / CI.EltSize;
  565. return true;
  566. }
  567. return false;
  568. }
  569. bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
  570. const CombineInfo &CI) {
  571. const unsigned Width = (CI.Width0 + CI.Width1);
  572. switch (CI.InstClass) {
  573. default:
  574. return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
  575. case S_BUFFER_LOAD_IMM:
  576. switch (Width) {
  577. default:
  578. return false;
  579. case 2:
  580. case 4:
  581. return true;
  582. }
  583. }
  584. }
  585. bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
  586. MachineBasicBlock *MBB = CI.I->getParent();
  587. MachineBasicBlock::iterator E = MBB->end();
  588. MachineBasicBlock::iterator MBBI = CI.I;
  589. const unsigned Opc = CI.I->getOpcode();
  590. const InstClassEnum InstClass = getInstClass(Opc, *TII);
  591. if (InstClass == UNKNOWN) {
  592. return false;
  593. }
  594. const unsigned InstSubclass = getInstSubclass(Opc, *TII);
  595. // Do not merge VMEM buffer instructions with "swizzled" bit set.
  596. int Swizzled =
  597. AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
  598. if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
  599. return false;
  600. ++MBBI;
  601. DenseSet<unsigned> RegDefsToMove;
  602. DenseSet<unsigned> PhysRegUsesToMove;
  603. addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
  604. for (; MBBI != E; ++MBBI) {
  605. if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
  606. (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
  607. // This is not a matching instruction, but we can keep looking as
  608. // long as one of these conditions are met:
  609. // 1. It is safe to move I down past MBBI.
  610. // 2. It is safe to move MBBI down past the instruction that I will
  611. // be merged into.
  612. if (MBBI->hasUnmodeledSideEffects()) {
  613. // We can't re-order this instruction with respect to other memory
  614. // operations, so we fail both conditions mentioned above.
  615. return false;
  616. }
  617. if (MBBI->mayLoadOrStore() &&
  618. (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
  619. !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
  620. // We fail condition #1, but we may still be able to satisfy condition
  621. // #2. Add this instruction to the move list and then we will check
  622. // if condition #2 holds once we have selected the matching instruction.
  623. CI.InstsToMove.push_back(&*MBBI);
  624. addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
  625. continue;
  626. }
  627. // When we match I with another DS instruction we will be moving I down
  628. // to the location of the matched instruction, so any uses of I will need
  629. // to be moved down as well.
  630. addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
  631. CI.InstsToMove);
  632. continue;
  633. }
  634. // Don't merge volatiles.
  635. if (MBBI->hasOrderedMemoryRef())
  636. return false;
  637. // Handle a case like
  638. // DS_WRITE_B32 addr, v, idx0
  639. // w = DS_READ_B32 addr, idx0
  640. // DS_WRITE_B32 addr, f(w), idx1
  641. // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
  642. // merging of the two writes.
  643. if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
  644. CI.InstsToMove))
  645. continue;
  646. bool Match = CI.hasSameBaseAddress(*MBBI);
  647. if (Match) {
  648. CI.setPaired(MBBI, *TII);
  649. // Check both offsets fit in the reduced range.
  650. // We also need to go through the list of instructions that we plan to
  651. // move and make sure they are all safe to move down past the merged
  652. // instruction.
  653. if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
  654. if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
  655. return true;
  656. }
  657. // We've found a load/store that we couldn't merge for some reason.
  658. // We could potentially keep looking, but we'd need to make sure that
  659. // it was safe to move I and also all the instructions in InstsToMove
  660. // down past this instruction.
  661. // Check if we can move I across MBBI and if we can move all I's users.
  662. if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
  663. !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
  664. break;
  665. }
  666. return false;
  667. }
  668. unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
  669. if (STM->ldsRequiresM0Init())
  670. return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  671. return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
  672. }
  673. unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
  674. if (STM->ldsRequiresM0Init())
  675. return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  676. return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
  677. : AMDGPU::DS_READ2ST64_B64_gfx9;
  678. }
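// As an illustration (register numbers are made up), with EltSize = 4:
//   ds_read_b32 v0, v2 offset:40
//   ds_read_b32 v1, v2 offset:44
// ==>
//   ds_read2_b32 v[0:1], v2 offset0:10 offset1:11
// followed by COPYs from sub0/sub1 of the merged result back into the
// original destination registers.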
  679. MachineBasicBlock::iterator
  680. SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
  681. MachineBasicBlock *MBB = CI.I->getParent();
  682. // Be careful, since the addresses could be subregisters themselves in weird
  683. // cases, like vectors of pointers.
  684. const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  685. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdst);
  686. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdst);
  687. unsigned NewOffset0 = CI.Offset0;
  688. unsigned NewOffset1 = CI.Offset1;
  689. unsigned Opc =
  690. CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
  691. unsigned SubRegIdx0 = (CI.EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  692. unsigned SubRegIdx1 = (CI.EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
  693. if (NewOffset0 > NewOffset1) {
  694. // Canonicalize the merged instruction so the smaller offset comes first.
  695. std::swap(NewOffset0, NewOffset1);
  696. std::swap(SubRegIdx0, SubRegIdx1);
  697. }
  698. assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
  699. (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  700. const MCInstrDesc &Read2Desc = TII->get(Opc);
  701. const TargetRegisterClass *SuperRC =
  702. (CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
  703. Register DestReg = MRI->createVirtualRegister(SuperRC);
  704. DebugLoc DL = CI.I->getDebugLoc();
  705. Register BaseReg = AddrReg->getReg();
  706. unsigned BaseSubReg = AddrReg->getSubReg();
  707. unsigned BaseRegFlags = 0;
  708. if (CI.BaseOff) {
  709. Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  710. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
  711. .addImm(CI.BaseOff);
  712. BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  713. BaseRegFlags = RegState::Kill;
  714. TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
  715. .addReg(ImmReg)
  716. .addReg(AddrReg->getReg(), 0, BaseSubReg)
  717. .addImm(0); // clamp bit
  718. BaseSubReg = 0;
  719. }
  720. MachineInstrBuilder Read2 =
  721. BuildMI(*MBB, CI.Paired, DL, Read2Desc, DestReg)
  722. .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
  723. .addImm(NewOffset0) // offset0
  724. .addImm(NewOffset1) // offset1
  725. .addImm(0) // gds
  726. .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
  727. (void)Read2;
  728. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  729. // Copy to the old destination registers.
  730. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  731. .add(*Dest0) // Copy to same destination including flags and sub reg.
  732. .addReg(DestReg, 0, SubRegIdx0);
  733. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  734. .add(*Dest1)
  735. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  736. moveInstsAfter(Copy1, CI.InstsToMove);
  737. CI.I->eraseFromParent();
  738. CI.Paired->eraseFromParent();
  739. LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
  740. return Read2;
  741. }
  742. unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
  743. if (STM->ldsRequiresM0Init())
  744. return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  745. return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
  746. : AMDGPU::DS_WRITE2_B64_gfx9;
  747. }
  748. unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
  749. if (STM->ldsRequiresM0Init())
  750. return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
  751. : AMDGPU::DS_WRITE2ST64_B64;
  752. return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
  753. : AMDGPU::DS_WRITE2ST64_B64_gfx9;
  754. }
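// As an illustration (register numbers are made up), with EltSize = 4:
//   ds_write_b32 v2, v0 offset:16
//   ds_write_b32 v2, v1 offset:20
// ==>
//   ds_write2_b32 v2, v0, v1 offset0:4 offset1:5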
  755. MachineBasicBlock::iterator
  756. SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
  757. MachineBasicBlock *MBB = CI.I->getParent();
  758. // Be sure to use .add() rather than .addReg() with these, so that the
  759. // subregister index and any register flags set on them are preserved.
  760. const MachineOperand *AddrReg =
  761. TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
  762. const MachineOperand *Data0 =
  763. TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
  764. const MachineOperand *Data1 =
  765. TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::data0);
  766. unsigned NewOffset0 = CI.Offset0;
  767. unsigned NewOffset1 = CI.Offset1;
  768. unsigned Opc =
  769. CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
  770. if (NewOffset0 > NewOffset1) {
  771. // Canonicalize the merged instruction so the smaller offset comes first.
  772. std::swap(NewOffset0, NewOffset1);
  773. std::swap(Data0, Data1);
  774. }
  775. assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
  776. (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
  777. const MCInstrDesc &Write2Desc = TII->get(Opc);
  778. DebugLoc DL = CI.I->getDebugLoc();
  779. Register BaseReg = AddrReg->getReg();
  780. unsigned BaseSubReg = AddrReg->getSubReg();
  781. unsigned BaseRegFlags = 0;
  782. if (CI.BaseOff) {
  783. Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  784. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
  785. .addImm(CI.BaseOff);
  786. BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  787. BaseRegFlags = RegState::Kill;
  788. TII->getAddNoCarry(*MBB, CI.Paired, DL, BaseReg)
  789. .addReg(ImmReg)
  790. .addReg(AddrReg->getReg(), 0, BaseSubReg)
  791. .addImm(0); // clamp bit
  792. BaseSubReg = 0;
  793. }
  794. MachineInstrBuilder Write2 =
  795. BuildMI(*MBB, CI.Paired, DL, Write2Desc)
  796. .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
  797. .add(*Data0) // data0
  798. .add(*Data1) // data1
  799. .addImm(NewOffset0) // offset0
  800. .addImm(NewOffset1) // offset1
  801. .addImm(0) // gds
  802. .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
  803. moveInstsAfter(Write2, CI.InstsToMove);
  804. CI.I->eraseFromParent();
  805. CI.Paired->eraseFromParent();
  806. LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
  807. return Write2;
  808. }
  809. MachineBasicBlock::iterator
  810. SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
  811. MachineBasicBlock *MBB = CI.I->getParent();
  812. DebugLoc DL = CI.I->getDebugLoc();
  813. const unsigned Opcode = getNewOpcode(CI);
  814. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  815. Register DestReg = MRI->createVirtualRegister(SuperRC);
  816. unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
  817. // It shouldn't be possible to get this far if the two instructions
  818. // don't have a single memoperand, because MachineInstr::mayAlias()
  819. // will return true if this is the case.
  820. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  821. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  822. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  823. MachineInstr *New =
  824. BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
  825. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
  826. .addImm(MergedOffset) // offset
  827. .addImm(CI.GLC0) // glc
  828. .addImm(CI.DLC0) // dlc
  829. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  830. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  831. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  832. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  833. // Copy to the old destination registers.
  834. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  835. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::sdst);
  836. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::sdst);
  837. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  838. .add(*Dest0) // Copy to same destination including flags and sub reg.
  839. .addReg(DestReg, 0, SubRegIdx0);
  840. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  841. .add(*Dest1)
  842. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  843. moveInstsAfter(Copy1, CI.InstsToMove);
  844. CI.I->eraseFromParent();
  845. CI.Paired->eraseFromParent();
  846. return New;
  847. }
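// The MUBUF case is analogous, e.g. (illustrative registers):
//   buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
//   buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
// ==>
//   buffer_load_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:4
// with sub-register COPYs back to the original destinations.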
  848. MachineBasicBlock::iterator
  849. SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
  850. MachineBasicBlock *MBB = CI.I->getParent();
  851. DebugLoc DL = CI.I->getDebugLoc();
  852. const unsigned Opcode = getNewOpcode(CI);
  853. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  854. // Copy to the new source register.
  855. Register DestReg = MRI->createVirtualRegister(SuperRC);
  856. unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
  857. auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
  858. const unsigned Regs = getRegs(Opcode, *TII);
  859. if (Regs & VADDR)
  860. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  861. // It shouldn't be possible to get this far if the two instructions
  862. // don't have a single memoperand, because MachineInstr::mayAlias()
  863. // will return true if this is the case.
  864. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  865. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  866. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  867. MachineInstr *New =
  868. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
  869. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
  870. .addImm(MergedOffset) // offset
  871. .addImm(CI.GLC0) // glc
  872. .addImm(CI.SLC0) // slc
  873. .addImm(0) // tfe
  874. .addImm(CI.DLC0) // dlc
  875. .addImm(0) // swz
  876. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  877. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  878. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  879. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  880. // Copy to the old destination registers.
  881. const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
  882. const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  883. const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
  884. BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  885. .add(*Dest0) // Copy to same destination including flags and sub reg.
  886. .addReg(DestReg, 0, SubRegIdx0);
  887. MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
  888. .add(*Dest1)
  889. .addReg(DestReg, RegState::Kill, SubRegIdx1);
  890. moveInstsAfter(Copy1, CI.InstsToMove);
  891. CI.I->eraseFromParent();
  892. CI.Paired->eraseFromParent();
  893. return New;
  894. }
  895. unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
  896. const unsigned Width = CI.Width0 + CI.Width1;
  897. switch (CI.InstClass) {
  898. default:
  899. assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
  900. // FIXME: Handle d16 correctly
  901. return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
  902. Width);
  903. case UNKNOWN:
  904. llvm_unreachable("Unknown instruction class");
  905. case S_BUFFER_LOAD_IMM:
  906. switch (Width) {
  907. default:
  908. return 0;
  909. case 2:
  910. return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
  911. case 4:
  912. return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
  913. }
  914. }
  915. }
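// Returns the sub-register indices used to connect each original result to
// the merged register. For example, Width0 = 1 and Width1 = 2 with
// Offset0 < Offset1 yields (sub0, sub1_sub2); with the offsets reversed it
// yields (sub2, sub0_sub1).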
  916. std::pair<unsigned, unsigned>
  917. SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
  918. if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
  919. return std::make_pair(0, 0);
  920. bool ReverseOrder = CI.Offset0 > CI.Offset1;
  921. static const unsigned Idxs[4][4] = {
  922. {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
  923. {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
  924. {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
  925. {AMDGPU::sub3, 0, 0, 0},
  926. };
  927. unsigned Idx0;
  928. unsigned Idx1;
  929. assert(CI.Width0 >= 1 && CI.Width0 <= 3);
  930. assert(CI.Width1 >= 1 && CI.Width1 <= 3);
  931. if (ReverseOrder) {
  932. Idx1 = Idxs[0][CI.Width1 - 1];
  933. Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
  934. } else {
  935. Idx0 = Idxs[0][CI.Width0 - 1];
  936. Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
  937. }
  938. return std::make_pair(Idx0, Idx1);
  939. }
  940. const TargetRegisterClass *
  941. SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
  942. if (CI.InstClass == S_BUFFER_LOAD_IMM) {
  943. switch (CI.Width0 + CI.Width1) {
  944. default:
  945. return nullptr;
  946. case 2:
  947. return &AMDGPU::SReg_64_XEXECRegClass;
  948. case 4:
  949. return &AMDGPU::SGPR_128RegClass;
  950. case 8:
  951. return &AMDGPU::SReg_256RegClass;
  952. case 16:
  953. return &AMDGPU::SReg_512RegClass;
  954. }
  955. } else {
  956. switch (CI.Width0 + CI.Width1) {
  957. default:
  958. return nullptr;
  959. case 2:
  960. return &AMDGPU::VReg_64RegClass;
  961. case 3:
  962. return &AMDGPU::VReg_96RegClass;
  963. case 4:
  964. return &AMDGPU::VReg_128RegClass;
  965. }
  966. }
  967. }
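// For stores, the two data operands are first packed into a wider register
// with a REG_SEQUENCE, e.g. (illustrative operands):
//   buffer_store_dword v3, v0, s[0:3], 0 offen offset:4
//   buffer_store_dword v4, v0, s[0:3], 0 offen offset:8
// ==>
//   %5:vreg_64 = REG_SEQUENCE v3, %subreg.sub0, v4, %subreg.sub1
//   buffer_store_dwordx2 %5, v0, s[0:3], 0 offen offset:4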
  968. MachineBasicBlock::iterator
  969. SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
  970. MachineBasicBlock *MBB = CI.I->getParent();
  971. DebugLoc DL = CI.I->getDebugLoc();
  972. const unsigned Opcode = getNewOpcode(CI);
  973. std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
  974. const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
  975. const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
  976. // Copy to the new source register.
  977. const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
  978. Register SrcReg = MRI->createVirtualRegister(SuperRC);
  979. const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
  980. const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
  981. BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
  982. .add(*Src0)
  983. .addImm(SubRegIdx0)
  984. .add(*Src1)
  985. .addImm(SubRegIdx1);
  986. auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
  987. .addReg(SrcReg, RegState::Kill);
  988. const unsigned Regs = getRegs(Opcode, *TII);
  989. if (Regs & VADDR)
  990. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
  991. // It shouldn't be possible to get this far if the two instructions
  992. // don't have a single memoperand, because MachineInstr::mayAlias()
  993. // will return true if this is the case.
  994. assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
  995. const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
  996. const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
  997. MachineInstr *New =
  998. MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
  999. .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
  1000. .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
  1001. .addImm(CI.GLC0) // glc
  1002. .addImm(CI.SLC0) // slc
  1003. .addImm(0) // tfe
  1004. .addImm(CI.DLC0) // dlc
  1005. .addImm(0) // swz
  1006. .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
  1007. moveInstsAfter(MIB, CI.InstsToMove);
  1008. CI.I->eraseFromParent();
  1009. CI.Paired->eraseFromParent();
  1010. return New;
  1011. }
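// Returns Val directly as an immediate operand when it is a legal inline
// constant (e.g. a small integer in the range -16..64); otherwise it is
// materialized into an SGPR with S_MOV_B32 and that register is returned.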
  1012. MachineOperand
  1013. SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  1014. APInt V(32, Val, true);
  1015. if (TII->isInlineConstant(V))
  1016. return MachineOperand::CreateImm(Val);
  1017. Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  1018. MachineInstr *Mov =
  1019. BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
  1020. TII->get(AMDGPU::S_MOV_B32), Reg)
  1021. .addImm(Val);
  1022. (void)Mov;
  1023. LLVM_DEBUG(dbgs() << " "; Mov->dump());
  1024. return MachineOperand::CreateReg(Reg, false);
  1025. }
  1026. // Compute base address using Addr and return the final register.
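// The 64-bit add is expanded as (illustrative virtual registers):
//   %lo:vgpr_32, %carry = V_ADD_I32_e64  Addr.Base.LoReg, OffsetLo
//   %hi:vgpr_32         = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry
//   %base:vreg_64       = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1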
  1027. unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
  1028. const MemAddress &Addr) const {
  1029. MachineBasicBlock *MBB = MI.getParent();
  1030. MachineBasicBlock::iterator MBBI = MI.getIterator();
  1031. DebugLoc DL = MI.getDebugLoc();
  1032. assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
  1033. Addr.Base.LoSubReg) &&
  1034. "Expected 32-bit Base-Register-Low!!");
  1035. assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
  1036. Addr.Base.HiSubReg) &&
  1037. "Expected 32-bit Base-Register-Hi!!");
  1038. LLVM_DEBUG(dbgs() << " Re-Computed Anchor-Base:\n");
  1039. MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset), MI);
  1040. MachineOperand OffsetHi =
  1041. createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
  1042. const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  1043. Register CarryReg = MRI->createVirtualRegister(CarryRC);
  1044. Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
  1045. Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  1046. Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  1047. MachineInstr *LoHalf =
  1048. BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
  1049. .addReg(CarryReg, RegState::Define)
  1050. .addReg(Addr.Base.LoReg, 0, Addr.Base.LoSubReg)
  1051. .add(OffsetLo)
  1052. .addImm(0); // clamp bit
  1053. (void)LoHalf;
  1054. LLVM_DEBUG(dbgs() << " "; LoHalf->dump(););
  1055. MachineInstr *HiHalf =
  1056. BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
  1057. .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
  1058. .addReg(Addr.Base.HiReg, 0, Addr.Base.HiSubReg)
  1059. .add(OffsetHi)
  1060. .addReg(CarryReg, RegState::Kill)
  1061. .addImm(0); // clamp bit
  1062. (void)HiHalf;
  1063. LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
  1064. Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
  1065. MachineInstr *FullBase =
  1066. BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
  1067. .addReg(DestSub0)
  1068. .addImm(AMDGPU::sub0)
  1069. .addReg(DestSub1)
  1070. .addImm(AMDGPU::sub1);
  1071. (void)FullBase;
  1072. LLVM_DEBUG(dbgs() << " "; FullBase->dump(); dbgs() << "\n";);
  1073. return FullDestReg;
  1074. }
  1075. // Update base and offset with the NewBase and NewOffset in MI.
  1076. void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
  1077. unsigned NewBase,
  1078. int32_t NewOffset) const {
  1079. TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
  1080. TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
  1081. }
  1082. Optional<int32_t>
  1083. SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
  1084. if (Op.isImm())
  1085. return Op.getImm();
  1086. if (!Op.isReg())
  1087. return None;
  1088. MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  1089. if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
  1090. !Def->getOperand(1).isImm())
  1091. return None;
  1092. return Def->getOperand(1).getImm();
  1093. }
  1094. // Analyzes Base and extracts:
  1095. // - 32bit base registers, subregisters
  1096. // - 64bit constant offset
  1097. // Expecting base computation as:
  1098. // %OFFSET0:sgpr_32 = S_MOV_B32 8000
  1099. // %LO:vgpr_32, %c:sreg_64_xexec =
  1100. // V_ADD_I32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
  1101. // %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
  1102. // %Base:vreg_64 =
  1103. // REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
  1104. void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
  1105. MemAddress &Addr) const {
  1106. if (!Base.isReg())
  1107. return;
  1108. MachineInstr *Def = MRI->getUniqueVRegDef(Base.getReg());
  1109. if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE
  1110. || Def->getNumOperands() != 5)
  1111. return;
  1112. MachineOperand BaseLo = Def->getOperand(1);
  1113. MachineOperand BaseHi = Def->getOperand(3);
  1114. if (!BaseLo.isReg() || !BaseHi.isReg())
  1115. return;
  1116. MachineInstr *BaseLoDef = MRI->getUniqueVRegDef(BaseLo.getReg());
  1117. MachineInstr *BaseHiDef = MRI->getUniqueVRegDef(BaseHi.getReg());
  1118. if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_I32_e64 ||
  1119. !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
  1120. return;
  1121. const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  1122. const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
  1123. auto Offset0P = extractConstOffset(*Src0);
  1124. if (Offset0P)
  1125. BaseLo = *Src1;
  1126. else {
  1127. if (!(Offset0P = extractConstOffset(*Src1)))
  1128. return;
  1129. BaseLo = *Src0;
  1130. }
  1131. Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  1132. Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
  1133. if (Src0->isImm())
  1134. std::swap(Src0, Src1);
  1135. if (!Src1->isImm())
  1136. return;
  1137. uint64_t Offset1 = Src1->getImm();
  1138. BaseHi = *Src0;
  1139. Addr.Base.LoReg = BaseLo.getReg();
  1140. Addr.Base.HiReg = BaseHi.getReg();
  1141. Addr.Base.LoSubReg = BaseLo.getSubReg();
  1142. Addr.Base.HiSubReg = BaseHi.getSubReg();
  1143. Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
  1144. }
  1145. bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
  1146. MachineInstr &MI,
  1147. MemInfoMap &Visited,
  1148. SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
  1149. if (!(MI.mayLoad() ^ MI.mayStore()))
  1150. return false;
  1151. // TODO: Support flat and scratch.
  1152. if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
  1153. return false;
  1154. if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr)
  1155. return false;
  1156. if (AnchorList.count(&MI))
  1157. return false;
  1158. LLVM_DEBUG(dbgs() << "\nTryToPromoteConstantOffsetToImmFor "; MI.dump());
  1159. if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
  1160. LLVM_DEBUG(dbgs() << " Const-offset is already promoted.\n";);
  1161. return false;
  1162. }
  1163. // Step1: Find the base-registers and a 64bit constant offset.
  1164. MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  1165. MemAddress MAddr;
  1166. if (Visited.find(&MI) == Visited.end()) {
  1167. processBaseWithConstOffset(Base, MAddr);
  1168. Visited[&MI] = MAddr;
  1169. } else
  1170. MAddr = Visited[&MI];
  1171. if (MAddr.Offset == 0) {
  1172. LLVM_DEBUG(dbgs() << " Failed to extract constant-offset or there are no"
  1173. " constant offsets that can be promoted.\n";);
  1174. return false;
  1175. }
  1176. LLVM_DEBUG(dbgs() << " BASE: {" << MAddr.Base.HiReg << ", "
  1177. << MAddr.Base.LoReg << "} Offset: " << MAddr.Offset << "\n\n";);
  1178. // Step2: Traverse through MI's basic block and find an anchor (one that has
  1179. // the same base registers) with the highest 13-bit distance from MI's offset.
  1180. // E.g. (64bit loads)
  1181. // bb:
  1182. // addr1 = &a + 4096; load1 = load(addr1, 0)
  1183. // addr2 = &a + 6144; load2 = load(addr2, 0)
  1184. // addr3 = &a + 8192; load3 = load(addr3, 0)
  1185. // addr4 = &a + 10240; load4 = load(addr4, 0)
  1186. // addr5 = &a + 12288; load5 = load(addr5, 0)
  1187. //
  1188. // Starting from the first load, the optimization will try to find a new base
  1189. // from which (&a + 4096) has a 13-bit distance. Both &a + 6144 and &a + 8192
  1190. // have a 13-bit distance from &a + 4096. The heuristic picks &a + 8192 as the
  1191. // new base (anchor) because of the maximum distance, which can presumably
  1192. // accommodate more intermediate bases.
  1193. //
  1194. // Step3: move (&a + 8192) above load1. Compute and promote offsets from
  1195. // (&a + 8192) for load1, load2, load4.
  1196. // addr = &a + 8192
  1197. // load1 = load(addr, -4096)
  1198. // load2 = load(addr, -2048)
  1199. // load3 = load(addr, 0)
  1200. // load4 = load(addr, 2048)
  1201. // addr5 = &a + 12288; load5 = load(addr5, 0)
  1202. //
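
  // State for the anchor search: the best anchor found so far, plus every
  // instruction that shares MI's base registers together with its constant
  // offset, so those offsets can also be promoted once an anchor is chosen.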
  MachineInstr *AnchorInst = nullptr;
  MemAddress AnchorAddr;
  uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
  SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

  MachineBasicBlock *MBB = MI.getParent();
  MachineBasicBlock::iterator E = MBB->end();
  MachineBasicBlock::iterator MBBI = MI.getIterator();
  ++MBBI;
  const SITargetLowering *TLI =
      static_cast<const SITargetLowering *>(STM->getTargetLowering());

  for ( ; MBBI != E; ++MBBI) {
    MachineInstr &MINext = *MBBI;
    // TODO: Support finding an anchor (with the same base) from store
    // addresses or any other load addresses where the opcodes are different.
    if (MINext.getOpcode() != MI.getOpcode() ||
        TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
      continue;

    const MachineOperand &BaseNext =
        *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
    MemAddress MAddrNext;
    if (Visited.find(&MINext) == Visited.end()) {
      processBaseWithConstOffset(BaseNext, MAddrNext);
      Visited[&MINext] = MAddrNext;
    } else
      MAddrNext = Visited[&MINext];

    if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
        MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
        MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
        MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
      continue;

    InstsWCommonBase.push_back(std::make_pair(&MINext, MAddrNext.Offset));

    int64_t Dist = MAddr.Offset - MAddrNext.Offset;
    TargetLoweringBase::AddrMode AM;
    AM.HasBaseReg = true;
    AM.BaseOffs = Dist;
    if (TLI->isLegalGlobalAddressingMode(AM) &&
        (uint32_t)std::abs(Dist) > MaxDist) {
      MaxDist = std::abs(Dist);
      AnchorAddr = MAddrNext;
      AnchorInst = &MINext;
    }
  }
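
  // If an anchor was found, re-base MI on it and promote the offsets of any
  // other common-base instruction whose new offset is still legal.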
  if (AnchorInst) {
    LLVM_DEBUG(dbgs() << " Anchor-Inst(with max-distance from Offset): ";
               AnchorInst->dump());
    LLVM_DEBUG(dbgs() << " Anchor-Offset from BASE: "
               << AnchorAddr.Offset << "\n\n");

    // Instead of moving up, just re-compute anchor-instruction's base address.
    unsigned Base = computeBase(MI, AnchorAddr);

    updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
    LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););

    for (auto P : InstsWCommonBase) {
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = P.second - AnchorAddr.Offset;

      if (TLI->isLegalGlobalAddressingMode(AM)) {
        LLVM_DEBUG(dbgs() << " Promote Offset(" << P.second;
                   dbgs() << ")"; P.first->dump());
        updateBaseAndOffset(*P.first, Base, P.second - AnchorAddr.Offset);
        LLVM_DEBUG(dbgs() << " After promotion: "; P.first->dump());
      }
    }
    AnchorList.insert(AnchorInst);
    return true;
  }

  return false;
}
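
// Append CI to the list whose head has the same base address and instruction
// class; start a new single-element list if no such list exists yet.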
void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
        AddrList.front().InstClass == CI.InstClass) {
      AddrList.emplace_back(CI);
      return;
    }
  }

  // Base address not found, so add a new list.
  MergeableInsts.emplace_back(1, CI);
}

bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
  bool Modified = false;
  // Caches the base registers and constant offset computed for each visited
  // address.
  MemInfoMap Visited;
  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM.
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  // Sort potential mergeable instructions into lists. One list per base
  // address.
  for (MachineInstr &MI : MBB.instrs()) {
    // We run this before checking if an address is mergeable, because it can
    // produce better code even if the instructions aren't mergeable.
    if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
      Modified = true;

    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
    if (InstClass == UNKNOWN)
      continue;

    // Don't combine if volatile.
    if (MI.hasOrderedMemoryRef())
      continue;

    CombineInfo CI;
    CI.setMI(MI, *TII, *STM);

    if (!CI.hasMergeableAddress(*MRI))
      continue;

    addInstToMergeableList(CI, MergeableInsts);
  }
  return Modified;
}

// Scan through looking for adjacent LDS operations with constant offsets from
// the same base register. We rely on the scheduler to do the hard work of
// clustering nearby loads, and assume these are all adjacent.
bool SILoadStoreOptimizer::optimizeBlock(
                       std::list<std::list<CombineInfo> > &MergeableInsts) {
  bool Modified = false;

  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
    if (MergeList.size() < 2)
      continue;

    bool OptimizeListAgain = false;
    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
      // We weren't able to make any changes, so clear the list so we don't
      // process the same instructions the next time we try to optimize this
      // block.
      MergeList.clear();
      continue;
    }

    // We made changes, but also determined that there were no more
    // optimization opportunities, so we don't need to reprocess the list.
    if (!OptimizeListAgain)
      MergeList.clear();

    OptimizeAgain |= OptimizeListAgain;
    Modified = true;
  }
  return Modified;
}
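
// Erase the CombineInfo entry whose instruction is MI. Used to drop the
// paired instruction from the list once it has been merged away.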
void
SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
                                         const MachineInstr &MI) {
  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
    if (&*CI->I == &MI) {
      MergeList.erase(CI);
      return;
    }
  }
}
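
// Walk a single same-base list and merge pairs per instruction class.
// OptimizeListAgain is set when a merged result is still narrower than the
// per-class limit checked below, so another pass over the list may be able to
// merge it further.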
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
    std::list<CombineInfo> &MergeList, bool &OptimizeListAgain) {
  bool Modified = false;
  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
    CombineInfo &CI = *I;

    switch (CI.InstClass) {
    default:
      break;
    case DS_READ:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case DS_WRITE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
        CI.setMI(NewMI, *TII, *STM);
      }
      break;
    case S_BUFFER_LOAD_IMM:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
      }
      break;
    case BUFFER_LOAD:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    case BUFFER_STORE:
      if (findMatchingInst(CI)) {
        Modified = true;
        removeCombinedInst(MergeList, *CI.Paired);
        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
        CI.setMI(NewMI, *TII, *STM);
        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
      }
      break;
    }
    // Clear the InstsToMove after we have finished searching so we don't have
    // stale values left over if we search for this CI again in another pass
    // over the block.
    CI.InstsToMove.clear();
  }

  return Modified;
}

bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  MRI = &MF.getRegInfo();
  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  assert(MRI->isSSA() && "Must be run on SSA");

  LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

  bool Modified = false;

  for (MachineBasicBlock &MBB : MF) {
    std::list<std::list<CombineInfo> > MergeableInsts;
    // First pass: Collect list of all instructions we know how to merge.
    Modified |= collectMergeableInsts(MBB, MergeableInsts);
    do {
      OptimizeAgain = false;
      Modified |= optimizeBlock(MergeableInsts);
    } while (OptimizeAgain);
  }

  return Modified;
}