MachineCombiner.cpp

//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The machine combiner pass uses machine trace metrics to ensure the combined
// instructions do not lengthen the critical path or the resource depth.
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "machine-combiner"

STATISTIC(NumInstCombined, "Number of machine instructions combined");

static cl::opt<unsigned>
inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
              cl::desc("Incremental depth computation will be used for basic "
                       "blocks with more instructions than this threshold."),
              cl::init(500));

static cl::opt<bool> dump_intrs("machine-combiner-dump-subst-intrs", cl::Hidden,
                                cl::desc("Dump all substituted instrs"),
                                cl::init(false));

#ifdef EXPENSIVE_CHECKS
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(true));
#else
static cl::opt<bool> VerifyPatternOrder(
    "machine-combiner-verify-pattern-order", cl::Hidden,
    cl::desc(
        "Verify that the generated patterns are ordered by increasing latency"),
    cl::init(false));
#endif

namespace {
class MachineCombiner : public MachineFunctionPass {
  const TargetSubtargetInfo *STI;
  const TargetInstrInfo *TII;
  const TargetRegisterInfo *TRI;
  MCSchedModel SchedModel;
  MachineRegisterInfo *MRI;
  MachineLoopInfo *MLI; // Current MachineLoopInfo
  MachineTraceMetrics *Traces;
  MachineTraceMetrics::Ensemble *MinInstr;
  TargetSchedModel TSchedModel;

  /// True if optimizing for code size.
  bool OptSize;

public:
  static char ID;
  MachineCombiner() : MachineFunctionPass(ID) {
    initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  }
  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;
  StringRef getPassName() const override { return "Machine InstCombiner"; }

private:
  bool doSubstitute(unsigned NewSize, unsigned OldSize);
  bool combineInstructions(MachineBasicBlock *);
  MachineInstr *getOperandDef(const MachineOperand &MO);
  unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                    MachineTraceMetrics::Trace BlockTrace);
  unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                      MachineTraceMetrics::Trace BlockTrace);
  bool
  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                          MachineTraceMetrics::Trace BlockTrace,
                          SmallVectorImpl<MachineInstr *> &InsInstrs,
                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineCombinerPattern Pattern, bool SlackIsAccurate);
  bool preservesResourceLen(MachineBasicBlock *MBB,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            SmallVectorImpl<MachineInstr *> &DelInstrs);
  void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  std::pair<unsigned, unsigned>
  getLatenciesForInstrSequences(MachineInstr &MI,
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                MachineTraceMetrics::Trace BlockTrace);
  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
                          SmallVector<MachineCombinerPattern, 16> &Patterns);
};
}

char MachineCombiner::ID = 0;
char &llvm::MachineCombinerID = MachineCombiner::ID;

INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE,
                      "Machine InstCombiner", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
                    false, false)

void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.setPreservesCFG();
  AU.addPreserved<MachineDominatorTree>();
  AU.addRequired<MachineLoopInfo>();
  AU.addPreserved<MachineLoopInfo>();
  AU.addRequired<MachineTraceMetrics>();
  AU.addPreserved<MachineTraceMetrics>();
  MachineFunctionPass::getAnalysisUsage(AU);
}

MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
  MachineInstr *DefInstr = nullptr;
  // We need a virtual register definition.
  if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
    DefInstr = MRI->getUniqueVRegDef(MO.getReg());
  // PHIs have no depth etc.
  if (DefInstr && DefInstr->isPHI())
    DefInstr = nullptr;
  return DefInstr;
}

/// Computes depth of instructions in vector \p InsInstrs.
///
/// \param InsInstrs is a vector of machine instructions
/// \param InstrIdxForVirtReg is a dense map of virtual register to index
/// of defining machine instruction in \p InsInstrs
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
unsigned
MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                          DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                          MachineTraceMetrics::Trace BlockTrace) {
  SmallVector<unsigned, 16> InstrDepth;
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // For each instruction in the new sequence compute the depth based on the
  // operands. Use the trace information when possible. For new operands which
  // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
  // InstrDepth.
  for (auto *InstrPtr : InsInstrs) { // for each Use
    unsigned IDepth = 0;
    for (const MachineOperand &MO : InstrPtr->operands()) {
      // Check for virtual register operand.
      if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
        continue;
      if (!MO.isUse())
        continue;
      unsigned DepthOp = 0;
      unsigned LatencyOp = 0;
      DenseMap<unsigned, unsigned>::iterator II =
          InstrIdxForVirtReg.find(MO.getReg());
      if (II != InstrIdxForVirtReg.end()) {
        // Operand is new virtual register not in trace.
        assert(II->second < InstrDepth.size() && "Bad Index");
        MachineInstr *DefInstr = InsInstrs[II->second];
        assert(DefInstr &&
               "There must be a definition for a new virtual register");
        DepthOp = InstrDepth[II->second];
        int DefIdx = DefInstr->findRegisterDefOperandIdx(MO.getReg());
        int UseIdx = InstrPtr->findRegisterUseOperandIdx(MO.getReg());
        LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                      InstrPtr, UseIdx);
      } else {
        MachineInstr *DefInstr = getOperandDef(MO);
        if (DefInstr) {
          DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
          LatencyOp = TSchedModel.computeOperandLatency(
              DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
              InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
        }
      }
      IDepth = std::max(IDepth, DepthOp + LatencyOp);
    }
    InstrDepth.push_back(IDepth);
  }
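  // NewRoot is by convention the last instruction in the new sequence.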
  unsigned NewRootIdx = InsInstrs.size() - 1;
  return InstrDepth[NewRootIdx];
}

/// Computes instruction latency as max of latency of defined operands.
///
/// \param Root is a machine instruction that could be replaced by NewRoot.
/// It is used to compute a more accurate latency information for NewRoot in
/// case there is a dependent instruction in the same trace (\p BlockTrace)
/// \param NewRoot is the instruction for which the latency is computed
/// \param BlockTrace is a trace of machine instructions
///
/// \returns Latency of \p NewRoot
unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                                     MachineTraceMetrics::Trace BlockTrace) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");

  // Check each definition in NewRoot and compute the latency.
  unsigned NewRootLatency = 0;
  for (const MachineOperand &MO : NewRoot->operands()) {
    // Check for virtual register operand.
    if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
      continue;
    if (!MO.isDef())
      continue;
    // Get the first instruction that uses MO.
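    // reg_begin(Reg) visits the def of MO first; step past it so that RI
    // refers to the first use of the register (if any).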
    MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
    RI++;
    if (RI == MRI->reg_end())
      continue;
    MachineInstr *UseMO = RI->getParent();
    unsigned LatencyOp = 0;
    if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
      LatencyOp = TSchedModel.computeOperandLatency(
          NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
          UseMO->findRegisterUseOperandIdx(MO.getReg()));
    } else {
      LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
    }
    NewRootLatency = std::max(NewRootLatency, LatencyOp);
  }
  return NewRootLatency;
}

/// The combiner's goal may differ based on which pattern it is attempting
/// to optimize.
enum class CombinerObjective {
  MustReduceDepth, // The data dependency chain must be improved.
  Default          // The critical path must not be lengthened.
};

static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
  // TODO: If C++ ever gets a real enum class, make this part of the
  // MachineCombinerPattern class.
  switch (P) {
  case MachineCombinerPattern::REASSOC_AX_BY:
  case MachineCombinerPattern::REASSOC_AX_YB:
  case MachineCombinerPattern::REASSOC_XA_BY:
  case MachineCombinerPattern::REASSOC_XA_YB:
    return CombinerObjective::MustReduceDepth;
  default:
    return CombinerObjective::Default;
  }
}

/// Estimate the latency of the new and original instruction sequence by summing
/// up the latencies of the inserted and deleted instructions. This assumes
/// that the inserted and deleted instructions are dependent instruction chains,
/// which might not hold in all cases.
std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    MachineTraceMetrics::Trace BlockTrace) {
  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
  unsigned NewRootLatency = 0;
  // NewRoot is the last instruction in the \p InsInstrs vector.
  MachineInstr *NewRoot = InsInstrs.back();
  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
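  // For the new root itself, use the trace-aware latency so that a dependent
  // instruction inside the trace is taken into account.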
  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);

  unsigned RootLatency = 0;
  for (auto I : DelInstrs)
    RootLatency += TSchedModel.computeInstrLatency(I);

  return {NewRootLatency, RootLatency};
}

/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
/// The new code sequence ends in MI NewRoot. A necessary condition for the new
/// sequence to replace the old sequence is that it cannot lengthen the critical
/// path. The definition of "improve" may be restricted by specifying that the
/// new path improves the data dependency chain (MustReduceDepth).
bool MachineCombiner::improvesCriticalPathLen(
    MachineBasicBlock *MBB, MachineInstr *Root,
    MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs,
    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
    MachineCombinerPattern Pattern,
    bool SlackIsAccurate) {
  assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
         "Missing machine model\n");
  // Get depth and latency of NewRoot and Root.
  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
  unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;

  LLVM_DEBUG(dbgs() << " Dependence data for " << *Root << "\tNewRootDepth: "
                    << NewRootDepth << "\tRootDepth: " << RootDepth);

  // For a transform such as reassociation, the cost equation is
  // conservatively calculated so that we must improve the depth (data
  // dependency cycles) in the critical path to proceed with the transform.
  // Being conservative also protects against inaccuracies in the underlying
  // machine trace metrics and CPU models.
  if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth) {
    LLVM_DEBUG(dbgs() << "\tIt MustReduceDepth ");
    LLVM_DEBUG(NewRootDepth < RootDepth
                   ? dbgs() << "\t and it does it\n"
                   : dbgs() << "\t but it does NOT do it\n");
    return NewRootDepth < RootDepth;
  }

  // A more flexible cost calculation for the critical path includes the slack
  // of the original code sequence. This may allow the transform to proceed
  // even if the instruction depths (data dependency cycles) become worse.
  // Account for the latency of the inserted and deleted instructions by
  // summing up their individual latencies.
  unsigned NewRootLatency, RootLatency;
  std::tie(NewRootLatency, RootLatency) =
      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
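  // Slack is the number of cycles Root can be delayed without extending the
  // critical path; it is only trusted when the trace data for this block is
  // fully up to date (SlackIsAccurate).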
  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount =
      RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
  LLVM_DEBUG(dbgs() << "\n\tNewRootLatency: " << NewRootLatency
                    << "\tRootLatency: " << RootLatency << "\n\tRootSlack: "
                    << RootSlack << " SlackIsAccurate=" << SlackIsAccurate
                    << "\n\tNewRootDepth + NewRootLatency = " << NewCycleCount
                    << "\n\tRootDepth + RootLatency + RootSlack = "
                    << OldCycleCount;);
  LLVM_DEBUG(NewCycleCount <= OldCycleCount
                 ? dbgs() << "\n\t It IMPROVES PathLen because"
                 : dbgs() << "\n\t It DOES NOT improve PathLen because");
  LLVM_DEBUG(dbgs() << "\n\t\tNewCycleCount = " << NewCycleCount
                    << ", OldCycleCount = " << OldCycleCount << "\n");

  return NewCycleCount <= OldCycleCount;
}

/// Helper routine to convert instructions into their scheduling classes (SC).
void MachineCombiner::instr2instrSC(
    SmallVectorImpl<MachineInstr *> &Instrs,
    SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
  for (auto *InstrPtr : Instrs) {
    unsigned Opc = InstrPtr->getOpcode();
    unsigned Idx = TII->get(Opc).getSchedClass();
    const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
    InstrsSC.push_back(SC);
  }
}

/// True when the new instructions do not increase resource length.
bool MachineCombiner::preservesResourceLen(
    MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
    SmallVectorImpl<MachineInstr *> &InsInstrs,
    SmallVectorImpl<MachineInstr *> &DelInstrs) {
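  // Without an instruction scheduling model, resource length cannot be
  // computed, so optimistically assume it is preserved.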
  if (!TSchedModel.hasInstrSchedModel())
    return true;

  // Compute current resource length.
  SmallVector<const MachineBasicBlock *, 1> MBBarr;
  MBBarr.push_back(MBB);
  unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);

  // Deal with SC rather than Instructions.
  SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
  SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;

  instr2instrSC(InsInstrs, InsInstrsSC);
  instr2instrSC(DelInstrs, DelInstrsSC);

  ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
  ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);

  // Compute new resource length.
  unsigned ResLenAfterCombine =
      BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);

  LLVM_DEBUG(dbgs() << "\t\tResource length before replacement: "
                    << ResLenBeforeCombine
                    << " and after: " << ResLenAfterCombine << "\n";);
  LLVM_DEBUG(
      ResLenAfterCombine <= ResLenBeforeCombine
          ? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
          : dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
                      "Length\n");

  return ResLenAfterCombine <= ResLenBeforeCombine;
}

/// \returns true when new instruction sequence should be generated
/// regardless of whether it lengthens the critical path or not.
bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
  if (OptSize && (NewSize < OldSize))
    return true;
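  // Without a machine model there is no way to evaluate the trade-off, so
  // combine unconditionally.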
  if (!TSchedModel.hasInstrSchedModelOrItineraries())
    return true;
  return false;
}

/// Inserts InsInstrs and deletes DelInstrs. Incrementally updates instruction
/// depths if requested.
///
/// \param MBB basic block to insert instructions in
/// \param MI current machine instruction
/// \param InsInstrs new instructions to insert in \p MBB
/// \param DelInstrs instructions to delete from \p MBB
/// \param MinInstr is a pointer to the machine trace information
/// \param RegUnits set of live registers, needed to compute instruction depths
/// \param IncrementalUpdate if true, compute instruction depths incrementally,
///        otherwise invalidate the trace
static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                     SmallVector<MachineInstr *, 16> InsInstrs,
                                     SmallVector<MachineInstr *, 16> DelInstrs,
                                     MachineTraceMetrics::Ensemble *MinInstr,
                                     SparseSet<LiveRegUnit> &RegUnits,
                                     bool IncrementalUpdate) {
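  // Insert each new instruction immediately before the original root MI.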
  for (auto *InstrPtr : InsInstrs)
    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);

  for (auto *InstrPtr : DelInstrs) {
    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
    // Erase all LiveRegs defined by the removed instruction.
    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
      if (I->MI == InstrPtr)
        I = RegUnits.erase(I);
      else
        I++;
    }
  }

  if (IncrementalUpdate)
    for (auto *InstrPtr : InsInstrs)
      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
  else
    MinInstr->invalidate(MBB);

  NumInstCombined++;
}

// Check that the difference between original and new latency is non-increasing
// for later patterns. This helps to discover sub-optimal pattern orderings.
void MachineCombiner::verifyPatternOrder(
    MachineBasicBlock *MBB, MachineInstr &Root,
    SmallVector<MachineCombinerPattern, 16> &Patterns) {
  long PrevLatencyDiff = std::numeric_limits<long>::max();
  (void)PrevLatencyDiff; // Variable is used in assert only.

  for (auto P : Patterns) {
    SmallVector<MachineInstr *, 16> InsInstrs;
    SmallVector<MachineInstr *, 16> DelInstrs;
    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg);
    // Found pattern, but did not generate alternative sequence.
    // This can happen e.g. when an immediate could not be materialized
    // in a single instruction.
    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
      continue;

    unsigned NewRootLatency, RootLatency;
    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
           "Current pattern is better than previous pattern.");
    PrevLatencyDiff = CurrentLatencyDiff;
  }
}

/// Substitute a slow code sequence with a faster one by
/// evaluating an instruction combining pattern.
/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
/// combining based on machine trace metrics. Only combine a sequence of
/// instructions when this neither lengthens the critical path nor increases
/// resource pressure. When optimizing for code size always combine when the
/// new sequence is shorter.
bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
  bool Changed = false;
  LLVM_DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");

  bool IncrementalUpdate = false;
  auto BlockIter = MBB->begin();
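  // Marks how far instruction depths have been refreshed once incremental
  // updates are in use.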
  decltype(BlockIter) LastUpdate;
  // Check if the block is in a loop.
  const MachineLoop *ML = MLI->getLoopFor(MBB);
  if (!MinInstr)
    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);

  SparseSet<LiveRegUnit> RegUnits;
  RegUnits.setUniverse(TRI->getNumRegUnits());

  while (BlockIter != MBB->end()) {
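    // Post-increment here so that BlockIter stays valid when MI is erased by
    // a successful combine below.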
    auto &MI = *BlockIter++;
    SmallVector<MachineCombinerPattern, 16> Patterns;
    // The motivating example is:
    //
    //     MUL  Other        MUL_op1 MUL_op2  Other
    //      \    /               \      |    /
    //      ADD/SUB      =>        MADD/MSUB
    //      (=Root)                (=NewRoot)
    //
    // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
    // usually beneficial for code size it unfortunately can hurt performance
    // when the ADD is on the critical path, but the MUL is not. With the
    // substitution the MUL becomes part of the critical path (in form of the
    // MADD) and can lengthen it on architectures where the MADD latency is
    // longer than the ADD latency.
    //
    // For each instruction we check if it can be the root of a combiner
    // pattern. Then for each pattern the new code sequence in form of MI is
    // generated and evaluated. When the efficiency criteria (don't lengthen
    // critical path, don't use more resources) are met the new sequence gets
    // hooked up into the basic block before the old sequence is removed.
    //
    // The algorithm does not try to evaluate all patterns and pick the best.
    // This is only an artificial restriction though. In practice there is
    // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic. If
    // machine-combiner-verify-pattern-order is enabled, all patterns are
    // checked to ensure later patterns do not provide better latency savings.
    if (!TII->getMachineCombinerPatterns(MI, Patterns))
      continue;

    if (VerifyPatternOrder)
      verifyPatternOrder(MBB, MI, Patterns);

    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs;
      SmallVector<MachineInstr *, 16> DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      unsigned NewInstCount = InsInstrs.size();
      unsigned OldInstCount = DelInstrs.size();
      // Found pattern, but did not generate alternative sequence.
      // This can happen e.g. when an immediate could not be materialized
      // in a single instruction.
      if (!NewInstCount)
        continue;

      LLVM_DEBUG(if (dump_intrs) {
        dbgs() << "\tFor the Pattern (" << (int)P
               << ") these instructions could be removed\n";
        for (auto const *InstrPtr : DelInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
        dbgs() << "\tThese instructions could replace the removed ones\n";
        for (auto const *InstrPtr : InsInstrs)
          InstrPtr->print(dbgs(), /*IsStandalone*/ false, /*SkipOpers*/ false,
                          /*SkipDebugLoc*/ false, /*AddNewLine*/ true, TII);
      });
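      // Patterns that improve throughput are always applied when the block is
      // inside a loop, where sustained throughput matters more than the
      // critical path of a single iteration.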
      bool SubstituteAlways = false;
      if (ML && TII->isThroughputPattern(P))
        SubstituteAlways = true;

      if (IncrementalUpdate) {
        // Update depths since the last incremental update.
        MinInstr->updateDepths(LastUpdate, BlockIter, RegUnits);
        LastUpdate = BlockIter;
      }

      // Substitute when we optimize for code size and the new sequence has
      // fewer instructions OR
      // the new sequence neither lengthens the critical path nor increases
      // resource pressure.
      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                 RegUnits, IncrementalUpdate);
        // Eagerly stop after the first pattern fires.
        Changed = true;
        break;
      } else {
        // For big basic blocks, we only compute the full trace the first time
        // we hit this. We do not invalidate the trace, but instead update the
        // instruction depths incrementally.
        // NOTE: Only the instruction depths up to MI are accurate. All other
        // trace information is not updated.
        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
        Traces->verifyAnalysis();
        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
                                    InstrIdxForVirtReg, P,
                                    !IncrementalUpdate) &&
            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
          if (MBB->size() > inc_threshold) {
            // Use incremental depth updates for basic blocks above threshold.
            IncrementalUpdate = true;
            LastUpdate = BlockIter;
          }

          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
                                   RegUnits, IncrementalUpdate);

          // Eagerly stop after the first pattern fires.
          Changed = true;
          break;
        }
        // Cleanup instructions of the alternative code sequence. There is no
        // use for them.
        MachineFunction *MF = MBB->getParent();
        for (auto *InstrPtr : InsInstrs)
          MF->DeleteMachineInstr(InstrPtr);
      }
      InstrIdxForVirtReg.clear();
    }
  }

  if (Changed && IncrementalUpdate)
    Traces->invalidate(MBB);
  return Changed;
}

bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget();
  TII = STI->getInstrInfo();
  TRI = STI->getRegisterInfo();
  SchedModel = STI->getSchedModel();
  TSchedModel.init(STI);
  MRI = &MF.getRegInfo();
  MLI = &getAnalysis<MachineLoopInfo>();
  Traces = &getAnalysis<MachineTraceMetrics>();
  MinInstr = nullptr;
  OptSize = MF.getFunction().hasOptSize();

  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
  if (!TII->useMachineCombiner()) {
    LLVM_DEBUG(
        dbgs()
        << " Skipping pass: Target does not support machine combiner\n");
    return false;
  }

  bool Changed = false;

  // Try to combine instructions.
  for (auto &MBB : MF)
    Changed |= combineInstructions(&MBB);

  return Changed;
}