R600ControlFlowFinalizer.cpp

//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <new>
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {

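// Bookkeeping for the hardware control-flow stack: tracks the full entries and
// sub-entries pushed by branches and loops so the pass can report the maximum
// stack depth the shader needs.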
struct CFStack {
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries = 0;
  unsigned CurrentSubEntries = 0;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0) {}

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}

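// Returns true when the instruction needs a hardware work-around on the
// current subtarget, given the stack state tracked above (Cayman pushes inside
// nested loops, and the CF_ALU stack bug on affected parts).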
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}

unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}

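// Four sub-entries share one full stack entry, so the current stack size is
// the number of full entries plus the sub-entries rounded up to a multiple of
// four.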
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI; see comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}

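// The pass itself: rewrites control-flow pseudo instructions into native CF
// instructions, gathers fetch and ALU clauses, and records the maximum stack
// depth in R600MachineFunctionInfo.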
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII = nullptr;
  const R600RegisterInfo *TRI = nullptr;
  unsigned MaxFetchInst;
  const R600Subtarget *ST = nullptr;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

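  // Map a control-flow pseudo to the native opcode for the current hardware
  // generation (R600/R700 vs. Evergreen and later, with a Cayman-specific
  // CF_END).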
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

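  // A fetch instruction may only join the current clause if it does not read a
  // 128-bit register that an earlier instruction in the clause has written.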
  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
                                          E = MI.operands_end();
         I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }

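  // Collect a run of texture (or vertex) fetch instructions starting at I into
  // one clause and emit the corresponding CF_TC/CF_VC clause-head instruction.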
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
            .addImm(0) // ADDR
            .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }

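  // Rewrite ALU_LITERAL_X sources of MI so that up to four distinct literal
  // values map onto ALU_LITERAL_{X,Y,Z,W}, collecting the literal operands
  // that still need to be emitted.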
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          llvm::find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get corresponding Operand
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse existing literal reg
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate new literal reg
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
              .addImm(LiteralPair0)
              .addImm(LiteralPair1);
    }
    return InsertPos;
  }

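  // Gather the ALU instructions (and bundles) following the clause head,
  // materialize their literal constants as LITERALS instructions, and store
  // the final instruction count back into the clause head.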
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }

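  // Splice a recorded clause body to its final position and patch the clause
  // head with the now-known address; each fetch instruction is counted as two
  // CF instruction slots.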
  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }

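  // Add the now-known target address to the placeholder value stored in
  // operand 0 of a CF instruction.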
  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();

    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;

      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
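          // Fall through to the CF_ALU case: either way this instruction now
          // heads an ALU clause.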
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
                  .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
                  .addImm(0)
                  .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
                  .addImm(0)
                  .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                    .addImm(CfCount + 1)
                    .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
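        // RETURN ends the CF program: emit CF_END (padded to an even number of
        // CF slots) and then splice in the bodies of all recorded fetch and
        // ALU clauses.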
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, DL, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }

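      // Fold the stack pop into the last ALU clause before each ENDIF: replace
      // the recorded CF_ALU with an equivalent CF_ALU_POP_AFTER.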
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
                .addImm(Alu->getOperand(0).getImm())
                .addImm(Alu->getOperand(1).getImm())
                .addImm(Alu->getOperand(2).getImm())
                .addImm(Alu->getOperand(3).getImm())
                .addImm(Alu->getOperand(4).getImm())
                .addImm(Alu->getOperand(5).getImm())
                .addImm(Alu->getOperand(6).getImm())
                .addImm(Alu->getOperand(7).getImm())
                .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

FunctionPass *llvm::createR600ControlFlowFinalizer() {
  return new R600ControlFlowFinalizer();
}