R600ControlFlowFinalizer.cpp

//===-- R600ControlFlowFinalizer.cpp - Finalize Control Flow Inst --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass turns all control flow pseudo instructions into native ones,
/// computing their address on the fly; it also sets the STACK_SIZE info.
//===----------------------------------------------------------------------===//

#include "llvm/Support/Debug.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "r600cf"

namespace {
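
// CFStack models the control-flow stack depth the shader will need: full
// entries for loops and WQM pushes, smaller sub-entries for the other pushes.
// The maximum observed depth is later reported as the function's STACK_SIZE.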
struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}
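
// Decide whether the explicit-push hardware bug work-around must be applied
// before the given CF_ALU* opcode, based on the current stack state. The
// checks below are intentionally conservative.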
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch(Opcode) {
  default: return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;

    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here.  We only require this work-around if
      // CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
      //
      // We have to be conservative, because we don't know for certain that
      // our stack allocation algorithm for Evergreen/NI is correct.  Applying
      // this work-around when CurrentSubEntries > 3 allows us to over-allocate
      // stack resources without any problems.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // We are being conservative here.  We only require the work-around if
      // CurrentSubEntries > 7 &&
      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
      // See the comment on the wavefront size == 64 case for why we are
      // being conservative.
      return CurrentSubEntries > 7;
    }
  }
}
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch(Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 For the push operation.
      // +2 Extra space required.
      return 3;
    } else {
      // Some documentation says that this is not necessary on Evergreen,
      // but experimentation has shown that we need to allocate 1 extra
      // sub-entry for the first non-WQM push.
      // +1 For the push operation.
      // +1 Extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 For the push operation.
    // +1 Extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}
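
// One full stack entry holds four sub-entries, so sub-entries are rounded up
// to a multiple of four before being added to the full-entry count.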
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}

void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch(Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on
                                             // Evergreen/NI, see comment in
                                             // CFStack::getSubEntrySize()
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}
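
// The finalizer walks each basic block, groups fetch and ALU instructions
// into clauses, lowers the control-flow pseudo instructions to
// generation-specific hardware opcodes, and records the stack size computed
// by CFStack in the machine function info.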
class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget *ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }

  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }
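
  // An instruction may join the current fetch clause only if it does not read
  // a 128-bit register that an earlier instruction in the clause writes;
  // DstRegs accumulates the clause's destination super-registers.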
  bool isCompatibleWithClause(const MachineInstr *MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }
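
  // Collect consecutive texture or vertex fetch instructions (up to
  // MaxFetchInst, while they stay clause-compatible) and emit the CF_TC or
  // CF_VC clause header in front of them.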
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }
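
  // Rewrite every ALU_LITERAL_X source of MI to one of the four per-group
  // literal slots, reusing a slot when the same immediate already appears in
  // Lits and appending the immediate otherwise.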
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }
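
  // Gather the ALU instructions that follow an ALU clause header, unbundling
  // instruction groups, assigning literal slots, and emitting LITERALS pseudo
  // instructions, then store the final instruction count in the header.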
  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(&*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(literal0)
            .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, std::move(ClauseContent));
  }
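
  // The two Emit* helpers relocate a clause body behind its clause marker at
  // InsertPos, patch the clause header's address to the current CfCount, and
  // advance CfCount (fetch instructions count as two CF words, ALU
  // instructions as one).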
  void
  EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void
  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
      unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }
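
  // Patch the address operand of a control-flow instruction (or a set of
  // them) by adding the now-known CF address to the placeholder it carries.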
  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm)
      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}
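
  // Main driver: for every basic block, build fetch and ALU clauses, lower
  // the control-flow pseudos (loops, if/else/endif, break/continue, return)
  // to hardware instructions while tracking CfCount and the CF stack, then
  // record the maximum stack size in the function info.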
  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<AMDGPUSubtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->getShaderType());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->getShaderType() == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Intentional fall-through: the clause body is built the same way
          // as for a plain CF_ALU.

        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END));
          CfCount++;
          MI->eraseFromParent();
          if (CfCount % 2) {
            BuildMI(MBB, I, MBB.findDebugLoc(MI), TII->get(AMDGPU::PAD));
            CfCount++;
          }
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}