5 年之前 · ab7e61c53f
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
 
															       return true;
														
 
															     }
														
 
															+    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
														
 
															+      for (unsigned i = 0; i < NumAddresses; ++i) {
														
 
															+        const MachineOperand *AddrOp = AddrReg[i];
														
 
															+        // Immediates are always OK.
														
 
															+        if (AddrOp->isImm())
														
 
															+          continue;
														
 
															+
														
 
															+        // Don't try to merge addresses that aren't either immediates or registers.
														
 
															+        // TODO: Should be possible to merge FrameIndexes and maybe some other
														
 
															+        // non-register operands.
														
 
															+        if (!AddrOp->isReg())
														
 
															+          return false;
														
 
															+
														
 
															+        // TODO: We should be able to merge physical reg addresses.
														
 
															+        if (Register::isPhysicalRegister(AddrOp->getReg()))
														
 
															+          return false;
														
 
															+
														
 
															+        // If an address has only one use then there will be no other
														
 
															+        // instructions with the same address, so we can't merge this one.
														
 
															+        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
														
 
															+          return false;
														
 
															+      }
														
 
															+      return true; // Every address is an immediate or a multi-use non-physical register.
														
 
															+    }
														
 
															+
														
 
															     void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
														
 
															                const GCNSubtarget &STM);
														
 
															     void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
														
@@ -220,6 +245,10 @@ private:
 
															   bool promoteConstantOffsetToImm(MachineInstr &CI,
														
 
															                                   MemInfoMap &Visited,
														
 
															                                   SmallPtrSet<MachineInstr *, 4> &Promoted) const;
														
 
															+  void addInstToMergeableList(const CombineInfo &CI,
														
 
															+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
														
 
															+  bool collectMergeableInsts(MachineBasicBlock &MBB,
														
 
															+                  std::list<std::list<CombineInfo> > &MergeableInsts) const;
														
 
															 public:
														
 
															   static char ID;
														
@@ -228,7 +257,11 @@ public:
 
															     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
														
 
															   }
														
 
															-  bool optimizeBlock(MachineBasicBlock &MBB);
														
 
															+  void removeCombinedInst(std::list<CombineInfo> &MergeList,
														
 
															+                                         const MachineInstr &MI);
														
 
															+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
														
 
															+                                     bool &OptimizeListAgain);
														
 
															+  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
														
 
															   bool runOnMachineFunction(MachineFunction &MF) override;
														
@@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
 
															     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
														
 
															     AddrReg[i] = &I->getOperand(AddrIdx[i]);
														
 
															   }
														
 
															+
														
 
															+  InstsToMove.clear();
														
 
															 }
														
 
															 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
														
@@ -646,15 +681,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
 
															   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
														
 
															     return false;
														
 
															-  for (unsigned i = 0; i < CI.NumAddresses; i++) {
														
 
															-    // We only ever merge operations with the same base address register, so
														
 
															-    // don't bother scanning forward if there are no other uses.
														
 
															-    if (CI.AddrReg[i]->isReg() &&
														
 
															-        (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) ||
														
 
															-         MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg())))
														
 
															-      return false;
														
 
															-  }
														
 
															-
														
 
															   ++MBBI;
														
 
															   DenseSet<unsigned> RegDefsToMove;
														
@@ -827,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
 
															   moveInstsAfter(Copy1, CI.InstsToMove);
														
 
															-  MachineBasicBlock::iterator Next = std::next(CI.I);
														
 
															   CI.I->eraseFromParent();
														
 
															   CI.Paired->eraseFromParent();
														
 
															   LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
														
 
															-  return Next;
														
 
															+  return Read2;
														
 
															 }
														
 
															 unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
														
@@ -911,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
 
															   moveInstsAfter(Write2, CI.InstsToMove);
														
 
															-  MachineBasicBlock::iterator Next = std::next(CI.I);
														
 
															   CI.I->eraseFromParent();
														
 
															   CI.Paired->eraseFromParent();
														
 
															   LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
														
 
															-  return Next;
														
 
															+  return Write2;
														
 
															 }
														
 
															 MachineBasicBlock::iterator
														
@@ -938,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
 
															   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
														
 
															   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
														
 
															-  BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
														
 
															-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
														
 
															-      .addImm(MergedOffset) // offset
														
 
															-      .addImm(CI.GLC0)      // glc
														
 
															-      .addImm(CI.DLC0)      // dlc
														
 
															-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															+  MachineInstr *New =
														
 
															+    BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
														
 
															+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
														
 
															+        .addImm(MergedOffset) // offset
														
 
															+        .addImm(CI.GLC0)      // glc
														
 
															+        .addImm(CI.DLC0)      // dlc
														
 
															+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
														
 
															   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
														
@@ -963,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
 
															   moveInstsAfter(Copy1, CI.InstsToMove);
														
 
															-  MachineBasicBlock::iterator Next = std::next(CI.I);
														
 
															   CI.I->eraseFromParent();
														
 
															   CI.Paired->eraseFromParent();
														
 
															-  return Next;
														
 
															+  return New;
														
 
															 }
														
 
															 MachineBasicBlock::iterator
														
@@ -997,15 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
 
															   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
														
 
															   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
														
 
															-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
														
 
															-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
														
 
															-      .addImm(MergedOffset) // offset
														
 
															-      .addImm(CI.GLC0)      // glc
														
 
															-      .addImm(CI.SLC0)      // slc
														
 
															-      .addImm(0)            // tfe
														
 
															-      .addImm(CI.DLC0)      // dlc
														
 
															-      .addImm(0)            // swz
														
 
															-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															+  MachineInstr *New =
														
 
															+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
														
 
															+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
														
 
															+        .addImm(MergedOffset) // offset
														
 
															+        .addImm(CI.GLC0)      // glc
														
 
															+        .addImm(CI.SLC0)      // slc
														
 
															+        .addImm(0)            // tfe
														
 
															+        .addImm(CI.DLC0)      // dlc
														
 
															+        .addImm(0)            // swz
														
 
															+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															   std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
														
 
															   const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
														
@@ -1025,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
 
															   moveInstsAfter(Copy1, CI.InstsToMove);
														
 
															-  MachineBasicBlock::iterator Next = std::next(CI.I);
														
 
															   CI.I->eraseFromParent();
														
 
															   CI.Paired->eraseFromParent();
														
 
															-  return Next;
														
 
															+  return New;
														
 
															 }
														
 
															 unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
														
@@ -1191,22 +1215,22 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
 
															   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
														
 
															   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
														
 
															-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
														
 
															-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
														
 
															-      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
														
 
															-      .addImm(CI.GLC0)      // glc
														
 
															-      .addImm(CI.SLC0)      // slc
														
 
															-      .addImm(0)            // tfe
														
 
															-      .addImm(CI.DLC0)      // dlc
														
 
															-      .addImm(0)            // swz
														
 
															-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															+  MachineInstr *New =
														
 
															+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
														
 
															+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
														
 
															+        .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
														
 
															+        .addImm(CI.GLC0)      // glc
														
 
															+        .addImm(CI.SLC0)      // slc
														
 
															+        .addImm(0)            // tfe
														
 
															+        .addImm(CI.DLC0)      // dlc
														
 
															+        .addImm(0)            // swz
														
 
															+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
														
 
															   moveInstsAfter(MIB, CI.InstsToMove);
														
 
															-  MachineBasicBlock::iterator Next = std::next(CI.I);
														
 
															   CI.I->eraseFromParent();
														
 
															   CI.Paired->eraseFromParent();
														
 
															-  return Next;
														
 
															+  return New;
														
 
															 }
														
 
															 MachineOperand
														
@@ -1519,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
 
															   return false;
														
 
															 }
														
 
															-// Scan through looking for adjacent LDS operations with constant offsets from
														
 
															-// the same base register. We rely on the scheduler to do the hard work of
														
 
															-// clustering nearby loads, and assume these are all adjacent.
														
 
															-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
														
 
															-  bool Modified = false;
														
 
															+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
														
 
															+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
														
 
															+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
														
 
															+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
														
 
															+        AddrList.front().InstClass == CI.InstClass) {
														
 
															+      AddrList.emplace_back(CI); // Same base address and class: join this list.
														
 
															+      return;
														
 
															+    }
														
 
															+  }
														
 
															+  // No list matches this base address and instruction class, so add a new list.
														
 
															+  MergeableInsts.emplace_back(1, CI);
														
 
															+}
														
 
															+
														
 
															+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
														
 
															+                 std::list<std::list<CombineInfo> > &MergeableInsts) const {
														
 
															+  bool Modified = false;
														
 
															   // Contain the list
														
 
															   MemInfoMap Visited;
														
 
															   // Contains the list of instructions for which constant offsets are being
														
 
															   // promoted to the IMM.
														
 
															   SmallPtrSet<MachineInstr *, 4> AnchorList;
														
 
															-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
														
 
															-    MachineInstr &MI = *I;
														
 
															-
														
 
															+  // Sort potentially mergeable instructions into lists.  One list per base address and instruction class.
														
 
															+  for (MachineInstr &MI : MBB.instrs()) {
														
 
															+    // We run this before checking if an address is mergeable, because it can produce
														
 
															+    // better code even if the instructions aren't mergeable.
														
 
															     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
														
 
															       Modified = true;
														
 
															+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
														
 
															+    if (InstClass == UNKNOWN) // Skip opcodes that getInstClass does not classify.
														
 
															+      continue;
														
 
															+
														
 
															     // Don't combine if volatile.
														
 
															-    if (MI.hasOrderedMemoryRef()) {
														
 
															-      ++I;
														
 
															+    if (MI.hasOrderedMemoryRef())
														
 
															       continue;
														
 
															-    }
														
 
															     CombineInfo CI;
														
 
															-    CI.setMI(I, *TII, *STM);
														
 
															+    CI.setMI(MI, *TII, *STM);
														
 
															+
														
 
															+    if (!CI.hasMergeableAddress(*MRI))
														
 
															+      continue;
														
 
															+
														
 
															+    addInstToMergeableList(CI, MergeableInsts);
														
 
															+  }
														
 
															+  return Modified;
														
 
															+}
														
 
															+
														
 
															+// Scan through looking for adjacent LDS operations with constant offsets from
														
 
															+// the same base register. We rely on the scheduler to do the hard work of
														
 
															+// clustering nearby loads, and assume these are all adjacent.
														
 
															+bool SILoadStoreOptimizer::optimizeBlock(
														
 
															+                       std::list<std::list<CombineInfo> > &MergeableInsts) {
														
 
															+  bool Modified = false;
														
 
															+
														
 
															+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
														
 
															+    if (MergeList.size() < 2) // Nothing to pair a lone instruction with.
														
 
															+      continue;
														
 
															+
														
 
															+    bool OptimizeListAgain = false;
														
 
															+    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
														
 
															+      // We weren't able to make any changes, so clear the list so we don't
														
 
															+      // process the same instructions the next time we try to optimize this
														
 
															+      // block.
														
 
															+      MergeList.clear();
														
 
															+      continue;
														
 
															+    }
														
 
															+
														
 
															+    // We made changes, but also determined that there were no more optimization
														
 
															+    // opportunities, so we don't need to reprocess the list.
														
 
															+    if (!OptimizeListAgain)
														
 
															+      MergeList.clear();
														
 
															+
														
 
															+    OptimizeAgain |= OptimizeListAgain; // Any list wanting another pass re-runs the caller's loop.
														
 
															+    Modified = true;
														
 
															+  }
														
 
															+  return Modified;
														
 
															+}
														
 
															+
														
 
															+void
														
 
															+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
														
 
															+                                         const MachineInstr &MI) {
														
 
															+
														
 
															+  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
														
 
															+    if (&*CI->I == &MI) { // Match by instruction identity (address), not by value.
														
 
															+      MergeList.erase(CI);
														
 
															+      return; // Stop at the first match; CI is invalid after the erase.
														
 
															+    }
														
 
															+  }
														
 
															+}
														
 
															+
														
 
															+bool
														
 
															+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
														
 
															+                                          std::list<CombineInfo> &MergeList,
														
 
															+                                          bool &OptimizeListAgain) {
														
 
															+  bool Modified = false;
														
 
															+  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
														
 
															+    CombineInfo &CI = *I;
														
 
															     switch (CI.InstClass) {
														
 
															     default:
														
@@ -1552,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
 
															     case DS_READ:
														
 
															       if (findMatchingInst(CI)) {
														
 
															         Modified = true;
														
 
															-        I = mergeRead2Pair(CI);
														
 
															-      } else {
														
 
															-        ++I;
														
 
															+        removeCombinedInst(MergeList, *CI.Paired);
														
 
															+        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
														
 
															+        CI.setMI(NewMI, *TII, *STM);
														
 
															       }
														
 
															-      continue;
														
 
															+      break;
														
 
															     case DS_WRITE:
														
 
															       if (findMatchingInst(CI)) {
														
 
															         Modified = true;
														
 
															-        I = mergeWrite2Pair(CI);
														
 
															-      } else {
														
 
															-        ++I;
														
 
															+        removeCombinedInst(MergeList, *CI.Paired);
														
 
															+        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
														
 
															+        CI.setMI(NewMI, *TII, *STM);
														
 
															       }
														
 
															-      continue;
														
 
															+      break;
														
 
															     case S_BUFFER_LOAD_IMM:
														
 
															       if (findMatchingInst(CI)) {
														
 
															         Modified = true;
														
 
															-        I = mergeSBufferLoadImmPair(CI);
														
 
															-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
														
 
															-      } else {
														
 
															-        ++I;
														
 
															+        removeCombinedInst(MergeList, *CI.Paired);
														
 
															+        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
														
 
															+        CI.setMI(NewMI, *TII, *STM);
														
 
															+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
														
 
															       }
														
 
															-      continue;
														
 
															+      break;
														
 
															     case BUFFER_LOAD_OFFEN:
														
 
															     case BUFFER_LOAD_OFFSET:
														
 
															     case BUFFER_LOAD_OFFEN_exact:
														
 
															     case BUFFER_LOAD_OFFSET_exact:
														
 
															       if (findMatchingInst(CI)) {
														
 
															         Modified = true;
														
 
															-        I = mergeBufferLoadPair(CI);
														
 
															-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
														
 
															-      } else {
														
 
															-        ++I;
														
 
															+        removeCombinedInst(MergeList, *CI.Paired);
														
 
															+        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
														
 
															+        CI.setMI(NewMI, *TII, *STM);
														
 
															+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
														
 
															       }
														
 
															-      continue;
														
 
															+      break;
														
 
															     case BUFFER_STORE_OFFEN:
														
 
															     case BUFFER_STORE_OFFSET:
														
 
															     case BUFFER_STORE_OFFEN_exact:
														
 
															     case BUFFER_STORE_OFFSET_exact:
														
 
															       if (findMatchingInst(CI)) {
														
 
															         Modified = true;
														
 
															-        I = mergeBufferStorePair(CI);
														
 
															-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
														
 
															-      } else {
														
 
															-        ++I;
														
 
															+        removeCombinedInst(MergeList, *CI.Paired);
														
 
															+        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
														
 
															+        CI.setMI(NewMI, *TII, *STM);
														
 
															+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
														
 
															       }
														
 
															-      continue;
														
 
															+      break;
														
 
															     }
														
 
															-
														
 
															-    ++I;
														
 
															+    // Clear the InstsToMove after we have finished searching so we don't have
														
 
															+    // stale values left over if we search for this CI again in another pass
														
 
															+    // over the block.
														
 
															+    CI.InstsToMove.clear();
														
 
															   }
														
 
															   return Modified;
														
@@ -1626,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
															   bool Modified = false;
														
 
															+
														
 
															   for (MachineBasicBlock &MBB : MF) {
														
 
															+    std::list<std::list<CombineInfo> > MergeableInsts;
														
 
															+    // First pass: Collect list of all instructions we know how to merge.
														
 
															+    Modified |= collectMergeableInsts(MBB, MergeableInsts);
														
 
															     do {
														
 
															       OptimizeAgain = false;
														
 
															-      Modified |= optimizeBlock(MBB);
														
 
															+      Modified |= optimizeBlock(MergeableInsts);
														
 
															     } while (OptimizeAgain);
														
 
															   }