před 11 roky · 573931394f
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -292,6 +292,7 @@ namespace llvm {
 
				     bool isScheduleHigh   : 1;          // True if preferable to schedule high.
			
 
				     bool isScheduleLow    : 1;          // True if preferable to schedule low.
			
 
				     bool isCloned         : 1;          // True if this node has been cloned.
			
 
				+    bool isUnbuffered     : 1;          // Reads an unbuffered resource.
			
 
				     Sched::Preference SchedulingPref;   // Scheduling preference.
			
 
				 
			
 
				   private:
			
@@ -316,9 +317,10 @@ namespace llvm {
 
				         isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
			
 
				         hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
			
 
				         isAvailable(false), isScheduled(false), isScheduleHigh(false),
			
 
				-        isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
			
 
				-        isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
			
 
				-        TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				+        isScheduleLow(false), isCloned(false), isUnbuffered(false),
			
 
				+        SchedulingPref(Sched::None), isDepthCurrent(false),
			
 
				+        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
			
 
				+        BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				 
			
 
				     /// SUnit - Construct an SUnit for post-regalloc scheduling to represent
			
 
				     /// a MachineInstr.
			
@@ -330,9 +332,10 @@ namespace llvm {
 
				         isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
			
 
				         hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
			
 
				         isAvailable(false), isScheduled(false), isScheduleHigh(false),
			
 
				-        isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
			
 
				-        isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
			
 
				-        TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				+        isScheduleLow(false), isCloned(false), isUnbuffered(false),
			
 
				+        SchedulingPref(Sched::None), isDepthCurrent(false),
			
 
				+        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
			
 
				+        BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				 
			
 
				     /// SUnit - Construct a placeholder SUnit.
			
 
				     SUnit()
			
@@ -343,9 +346,10 @@ namespace llvm {
 
				         isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
			
 
				         hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
			
 
				         isAvailable(false), isScheduled(false), isScheduleHigh(false),
			
 
				-        isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
			
 
				-        isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
			
 
				-        TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				+        isScheduleLow(false), isCloned(false), isUnbuffered(false),
			
 
				+        SchedulingPref(Sched::None), isDepthCurrent(false),
			
 
				+        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
			
 
				+        BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
			
 
				 
			
 
				     /// \brief Boundary nodes are placeholders for the boundary of the
			
 
				     /// scheduling region.
			
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1330,7 +1330,7 @@ public:
 
				   /// Represent the type of SchedCandidate found within a single queue.
			
 
				   /// pickNodeBidirectional depends on these listed by decreasing priority.
			
 
				   enum CandReason {
			
 
				-    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
			
 
				+    NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
			
 
				     ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
			
 
				     TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
			
 
				 
			
@@ -1583,6 +1583,10 @@ public:
 
				                       MaxExecutedResCount);
			
 
				     }
			
 
				 
			
 
				+    /// Get the difference between the given SUnit's ready time and the current
			
 
				+    /// cycle.
			
 
				+    unsigned getLatencyStallCycles(SUnit *SU);
			
 
				+
			
 
				     bool checkHazard(SUnit *SU);
			
 
				 
			
 
				     unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
			
@@ -1869,6 +1873,23 @@ void GenericScheduler::registerRoots() {
 
				   }
			
 
				 }
			
 
				 
			
 
				+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
			
 
				+/// these "soft stalls" differently than the hard stall cycles based on CPU
			
 
				+/// resources and computed by checkHazard(). A fully in-order model
			
 
				+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
			
 
				+/// available for scheduling until they are ready. However, a weaker in-order
			
 
				+/// model may use this for heuristics. For example, if a processor has in-order
			
 
				+/// behavior when reading certain resources, this may come into play.
			
 
				+unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) {
			
 
				+  if (!SU->isUnbuffered)
			
 
				+    return 0;
			
 
				+
			
 
				+  unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
			
 
				+  if (ReadyCycle > CurrCycle)
			
 
				+    return ReadyCycle - CurrCycle;
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
 
				 /// Does this SU have a hazard within the current instruction group.
			
 
				 ///
			
 
				 /// The scheduler supports two modes of hazard recognition. The first is the
			
@@ -1948,9 +1969,9 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
 
				 /// inside and outside the zone.
			
 
				 void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
			
 
				                                                    SchedBoundary &OtherZone) {
			
 
				-  // Now that potential stalls have been considered, apply preemptive heuristics
			
 
				-  // based on the the total latency and resources inside and outside this
			
 
				-  // zone.
			
 
				+  // Apply preemptive heuristics based on the total latency and resources
			
 
				+  // inside and outside this zone. Potential stalls should be considered before
			
 
				+  // following this policy.
			
 
				 
			
 
				   // Compute remaining latency. We need this both to determine whether the
			
 
				   // overall schedule has become latency-limited and whether the instructions
			
@@ -2141,7 +2162,11 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
 
				     break;
			
 
				   default:
			
 
				     // We don't currently model the OOO reorder buffer, so consider all
			
 
				-    // scheduled MOps to be "retired".
			
 
				+    // scheduled MOps to be "retired". We do loosely model in-order resource
			
 
				+    // latency. If this instruction uses an in-order resource, account for any
			
 
				+    // likely stall cycles.
			
 
				+    if (SU->isUnbuffered && ReadyCycle > NextCycle)
			
 
				+      NextCycle = ReadyCycle;
			
 
				     break;
			
 
				   }
			
 
				   RetiredMOps += IncMOps;
			
@@ -2514,6 +2539,11 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
 
				       && tryLatency(TryCand, Cand, Zone))
			
 
				     return;
			
 
				 
			
 
				+  // Prioritize instructions that read unbuffered resources by stall cycles.
			
 
				+  if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
			
 
				+              Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
			
 
				+    return;
			
 
				+
			
 
				   // Keep clustered nodes together to encourage downstream peephole
			
 
				   // optimizations which may reduce resource requirements.
			
 
				   //
			
@@ -2577,6 +2607,7 @@ const char *GenericScheduler::getReasonStr(
 
				   case PhysRegCopy:    return "PREG-COPY";
			
 
				   case RegExcess:      return "REG-EXCESS";
			
 
				   case RegCritical:    return "REG-CRIT  ";
			
 
				+  case Stall:          return "STALL     ";
			
 
				   case Cluster:        return "CLUSTER   ";
			
 
				   case Weak:           return "WEAK      ";
			
 
				   case RegMax:         return "REG-MAX   ";
			
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -687,6 +687,22 @@ void ScheduleDAGInstrs::initSUnits() {
 
				 
			
 
				     // Assign the Latency field of SU using target-provided information.
			
 
				     SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
			
 
				+
			
 
				+    // If this SUnit uses an unbuffered resource, mark it as such.
			
 
				+    // These resources are used for in-order execution pipelines within an
			
 
				+    // out-of-order core and are identified by BufferSize=1. BufferSize=0 is
			
 
				+    // used for dispatch/issue groups and is not considered here.
			
 
				+    if (SchedModel.hasInstrSchedModel()) {
			
 
				+      const MCSchedClassDesc *SC = getSchedClass(SU);
			
 
				+      for (TargetSchedModel::ProcResIter
			
 
				+             PI = SchedModel.getWriteProcResBegin(SC),
			
 
				+             PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
			
 
				+        if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) {
			
 
				+          SU->isUnbuffered = true;
			
 
				+          break;
			
 
				+        }
			
 
				+      }
			
 
				+    }
			
 
				   }
			
 
				 }
			
 
				 
			
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1905,7 +1905,7 @@ def A9UnitALU : ProcResource<2>;
 
				 def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
			
 
				 def A9UnitAGU : ProcResource<1>;
			
 
				 def A9UnitLS  : ProcResource<1>;
			
 
				-def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
			
 
				+def A9UnitFP  : ProcResource<1> { let BufferSize = 1; }
			
 
				 def A9UnitB   : ProcResource<1>;
			
 
				 
			
 
				 //===----------------------------------------------------------------------===//
			
--- a/test/CodeGen/ARM/saxpy10-a9.ll
+++ b/test/CodeGen/ARM/saxpy10-a9.ll
@@ -0,0 +1,135 @@
 
				+; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false | FileCheck %s
			
 
				+;
			
 
				+; Test MI-Sched support for latency-based stalls on an in-order pipeline
			
 
				+; using the new machine model.
			
 
				+
			
 
				+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
			
 
				+
			
 
				+; Don't be too strict with the top of the schedule, but most of it
			
 
				+; should be nicely pipelined.
			
 
				+;
			
 
				+; CHECK: saxpy10:
			
 
				+; CHECK: vldr
			
 
				+; CHECK: vldr
			
 
				+; CHECK: vldr
			
 
				+; CHECK: vldr
			
 
				+; CHECK: vldr
			
 
				+; CHECK: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vmul
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vldr
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vadd
			
 
				+; CHECK-NEXT: vmov
			
 
				+; CHECK-NEXT: bx
			
 
				+;
			
 
				+; This accumulates a sum rather than storing each result.
			
 
				+define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) {
			
 
				+entry:
			
 
				+  %0 = load float* %data1, align 4
			
 
				+  %mul = fmul float %0, %a
			
 
				+  %1 = load float* %data2, align 4
			
 
				+  %add = fadd float %mul, %1
			
 
				+  %add2 = fadd float %add, 0.000000e+00
			
 
				+  %arrayidx.1 = getelementptr inbounds float* %data1, i32 1
			
 
				+  %2 = load float* %arrayidx.1, align 4
			
 
				+  %mul.1 = fmul float %2, %a
			
 
				+  %arrayidx1.1 = getelementptr inbounds float* %data2, i32 1
			
 
				+  %3 = load float* %arrayidx1.1, align 4
			
 
				+  %add.1 = fadd float %mul.1, %3
			
 
				+  %add2.1 = fadd float %add2, %add.1
			
 
				+  %arrayidx.2 = getelementptr inbounds float* %data1, i32 2
			
 
				+  %4 = load float* %arrayidx.2, align 4
			
 
				+  %mul.2 = fmul float %4, %a
			
 
				+  %arrayidx1.2 = getelementptr inbounds float* %data2, i32 2
			
 
				+  %5 = load float* %arrayidx1.2, align 4
			
 
				+  %add.2 = fadd float %mul.2, %5
			
 
				+  %add2.2 = fadd float %add2.1, %add.2
			
 
				+  %arrayidx.3 = getelementptr inbounds float* %data1, i32 3
			
 
				+  %6 = load float* %arrayidx.3, align 4
			
 
				+  %mul.3 = fmul float %6, %a
			
 
				+  %arrayidx1.3 = getelementptr inbounds float* %data2, i32 3
			
 
				+  %7 = load float* %arrayidx1.3, align 4
			
 
				+  %add.3 = fadd float %mul.3, %7
			
 
				+  %add2.3 = fadd float %add2.2, %add.3
			
 
				+  %arrayidx.4 = getelementptr inbounds float* %data1, i32 4
			
 
				+  %8 = load float* %arrayidx.4, align 4
			
 
				+  %mul.4 = fmul float %8, %a
			
 
				+  %arrayidx1.4 = getelementptr inbounds float* %data2, i32 4
			
 
				+  %9 = load float* %arrayidx1.4, align 4
			
 
				+  %add.4 = fadd float %mul.4, %9
			
 
				+  %add2.4 = fadd float %add2.3, %add.4
			
 
				+  %arrayidx.5 = getelementptr inbounds float* %data1, i32 5
			
 
				+  %10 = load float* %arrayidx.5, align 4
			
 
				+  %mul.5 = fmul float %10, %a
			
 
				+  %arrayidx1.5 = getelementptr inbounds float* %data2, i32 5
			
 
				+  %11 = load float* %arrayidx1.5, align 4
			
 
				+  %add.5 = fadd float %mul.5, %11
			
 
				+  %add2.5 = fadd float %add2.4, %add.5
			
 
				+  %arrayidx.6 = getelementptr inbounds float* %data1, i32 6
			
 
				+  %12 = load float* %arrayidx.6, align 4
			
 
				+  %mul.6 = fmul float %12, %a
			
 
				+  %arrayidx1.6 = getelementptr inbounds float* %data2, i32 6
			
 
				+  %13 = load float* %arrayidx1.6, align 4
			
 
				+  %add.6 = fadd float %mul.6, %13
			
 
				+  %add2.6 = fadd float %add2.5, %add.6
			
 
				+  %arrayidx.7 = getelementptr inbounds float* %data1, i32 7
			
 
				+  %14 = load float* %arrayidx.7, align 4
			
 
				+  %mul.7 = fmul float %14, %a
			
 
				+  %arrayidx1.7 = getelementptr inbounds float* %data2, i32 7
			
 
				+  %15 = load float* %arrayidx1.7, align 4
			
 
				+  %add.7 = fadd float %mul.7, %15
			
 
				+  %add2.7 = fadd float %add2.6, %add.7
			
 
				+  %arrayidx.8 = getelementptr inbounds float* %data1, i32 8
			
 
				+  %16 = load float* %arrayidx.8, align 4
			
 
				+  %mul.8 = fmul float %16, %a
			
 
				+  %arrayidx1.8 = getelementptr inbounds float* %data2, i32 8
			
 
				+  %17 = load float* %arrayidx1.8, align 4
			
 
				+  %add.8 = fadd float %mul.8, %17
			
 
				+  %add2.8 = fadd float %add2.7, %add.8
			
 
				+  %arrayidx.9 = getelementptr inbounds float* %data1, i32 9
			
 
				+  %18 = load float* %arrayidx.9, align 4
			
 
				+  %mul.9 = fmul float %18, %a
			
 
				+  %arrayidx1.9 = getelementptr inbounds float* %data2, i32 9
			
 
				+  %19 = load float* %arrayidx1.9, align 4
			
 
				+  %add.9 = fadd float %mul.9, %19
			
 
				+  %add2.9 = fadd float %add2.8, %add.9
			
 
				+  ret float %add2.9
			
 
				+}