
[MBP] Move a latch block with conditional exit and multi predecessors to top of loop

The current findBestLoopTop can find and move only one kind of block to the top of a loop: a latch block with a single successor. Another common case is:

    * a latch block
    * it has two successors: one is the loop header, the other is an exit
    * it has more than one predecessor

If such a block is laid out below one of its predecessors P, only P can fall through to it; every other predecessor needs a jump to it, followed by another conditional jump to the loop header. If it is instead moved before the loop header, all its predecessors can jump to it and then fall through to the loop header, so every predecessor except P saves one taken branch. A hypothetical example is sketched below.
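
For illustration only, here is a hypothetical C++ source (not part of this patch) whose loop takes this shape once the if/else arms merge into the block that tests the exit condition and carries the backedge:

    // Hypothetical example: the merge block holding `++I` and the
    // `S > Limit` test is a latch with a conditional exit (the break), a
    // backedge to the loop header, and two predecessors (the two arms of
    // the if/else). The exact machine CFG depends on earlier passes.
    int sumWithLimit(const int *A, int N, int Limit) {
      int S = 0, I = 0;
      while (I < N) {        // loop header
        if (A[I] % 2)        // the body splits into two predecessor blocks
          S += A[I];
        else
          S -= A[I];
        ++I;
        if (S > Limit)       // latch: conditional exit ...
          break;             // ... otherwise take the backedge to the header
      }
      return S;
    }

Placed before the loop header, the latch is reached from both arms by a single jump that then falls through into the header, instead of a jump plus a second taken branch.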

Differential Revision: https://reviews.llvm.org/D43256




git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@363471 91177308-0d34-0410-b5e6-96231b3b80d8
Guozhi Wei, 6 years ago
parent commit 7eae8125c6
70 files changed with 1357 additions and 816 deletions
  1. 233 50
      lib/CodeGen/MachineBlockPlacement.cpp
  2. 1 1
      test/CodeGen/AArch64/cmpxchg-idioms.ll
  3. 1 1
      test/CodeGen/AArch64/neg-imm.ll
  4. 2 3
      test/CodeGen/AArch64/tailmerging_in_mbp.ll
  5. 5 4
      test/CodeGen/AMDGPU/collapse-endcf.ll
  6. 21 20
      test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll
  7. 11 10
      test/CodeGen/AMDGPU/global_smrd_cfg.ll
  8. 1 1
      test/CodeGen/AMDGPU/hoist-cond.ll
  9. 6 6
      test/CodeGen/AMDGPU/i1-copy-from-loop.ll
  10. 6 6
      test/CodeGen/AMDGPU/indirect-addressing-si.ll
  11. 1 1
      test/CodeGen/AMDGPU/loop_break.ll
  12. 2 2
      test/CodeGen/AMDGPU/loop_exit_with_xor.ll
  13. 1 1
      test/CodeGen/AMDGPU/madmk.ll
  14. 28 28
      test/CodeGen/AMDGPU/multilevel-break.ll
  15. 4 4
      test/CodeGen/AMDGPU/optimize-negated-cond.ll
  16. 6 6
      test/CodeGen/AMDGPU/si-annotate-cf.ll
  17. 1 1
      test/CodeGen/AMDGPU/valu-i1.ll
  18. 7 4
      test/CodeGen/AMDGPU/wqm.ll
  19. 1 1
      test/CodeGen/ARM/2011-03-23-PeepholeBug.ll
  20. 2 3
      test/CodeGen/ARM/arm-and-tst-peephole.ll
  21. 1 1
      test/CodeGen/ARM/atomic-cmp.ll
  22. 13 13
      test/CodeGen/ARM/atomic-cmpxchg.ll
  23. 2 2
      test/CodeGen/ARM/code-placement.ll
  24. 1 1
      test/CodeGen/ARM/pr32578.ll
  25. 1 1
      test/CodeGen/ARM/swifterror.ll
  26. 1 1
      test/CodeGen/Hexagon/bug6757-endloop.ll
  27. 3 1
      test/CodeGen/Hexagon/early-if-merge-loop.ll
  28. 1 1
      test/CodeGen/Hexagon/prof-early-if.ll
  29. 1 1
      test/CodeGen/Hexagon/redundant-branching2.ll
  30. 144 168
      test/CodeGen/PowerPC/atomics-regression.ll
  31. 6 5
      test/CodeGen/PowerPC/cmp_elimination.ll
  32. 2 1
      test/CodeGen/PowerPC/ctrloop-shortLoops.ll
  33. 5 5
      test/CodeGen/PowerPC/expand-foldable-isel.ll
  34. 1 1
      test/CodeGen/PowerPC/knowCRBitSpill.ll
  35. 1 2
      test/CodeGen/PowerPC/licm-remat.ll
  36. 3 3
      test/CodeGen/SystemZ/atomicrmw-minmax-01.ll
  37. 3 3
      test/CodeGen/SystemZ/atomicrmw-minmax-02.ll
  38. 2 2
      test/CodeGen/SystemZ/loop-01.ll
  39. 1 1
      test/CodeGen/SystemZ/loop-02.ll
  40. 2 2
      test/CodeGen/SystemZ/swifterror.ll
  41. 6 6
      test/CodeGen/Thumb/consthoist-physical-addr.ll
  42. 9 10
      test/CodeGen/X86/block-placement.ll
  43. 5 2
      test/CodeGen/X86/code_placement.ll
  44. 1 1
      test/CodeGen/X86/code_placement_cold_loop_blocks.ll
  45. 3 4
      test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll
  46. 7 7
      test/CodeGen/X86/code_placement_loop_rotation2.ll
  47. 1 1
      test/CodeGen/X86/code_placement_no_header_change.ll
  48. 89 89
      test/CodeGen/X86/conditional-tailcall.ll
  49. 22 16
      test/CodeGen/X86/loop-blocks.ll
  50. 120 0
      test/CodeGen/X86/loop-rotate.ll
  51. 33 33
      test/CodeGen/X86/lsr-loop-exit-cond.ll
  52. 239 0
      test/CodeGen/X86/move_latch_to_loop_top.ll
  53. 8 8
      test/CodeGen/X86/pr38185.ll
  54. 63 60
      test/CodeGen/X86/ragreedy-hoist-spill.ll
  55. 18 17
      test/CodeGen/X86/reverse_branches.ll
  56. 30 27
      test/CodeGen/X86/speculative-load-hardening.ll
  57. 3 3
      test/CodeGen/X86/swifterror.ll
  58. 35 32
      test/CodeGen/X86/tail-dup-merge-loop-headers.ll
  59. 10 15
      test/CodeGen/X86/tail-dup-repeat.ll
  60. 35 32
      test/CodeGen/X86/vector-shift-by-select-loop.ll
  61. 8 8
      test/CodeGen/X86/widen_arith-1.ll
  62. 8 8
      test/CodeGen/X86/widen_arith-2.ll
  63. 8 8
      test/CodeGen/X86/widen_arith-3.ll
  64. 16 16
      test/CodeGen/X86/widen_arith-4.ll
  65. 8 8
      test/CodeGen/X86/widen_arith-5.ll
  66. 8 8
      test/CodeGen/X86/widen_arith-6.ll
  67. 16 16
      test/CodeGen/X86/widen_cast-4.ll
  68. 1 1
      test/CodeGen/X86/x86-cmov-converter.ll
  69. 6 6
      test/DebugInfo/X86/PR37234.ll
  70. 7 6
      test/DebugInfo/X86/dbg-value-transfer-order.ll

+ 233 - 50
lib/CodeGen/MachineBlockPlacement.cpp

@@ -455,15 +455,24 @@ class MachineBlockPlacement : public MachineFunctionPass {
                                const MachineBasicBlock *OldTop);
   bool hasViableTopFallthrough(const MachineBasicBlock *Top,
                                const BlockFilterSet &LoopBlockSet);
+  BlockFrequency TopFallThroughFreq(const MachineBasicBlock *Top,
+                                    const BlockFilterSet &LoopBlockSet);
+  BlockFrequency FallThroughGains(const MachineBasicBlock *NewTop,
+                                  const MachineBasicBlock *OldTop,
+                                  const MachineBasicBlock *ExitBB,
+                                  const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop,
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopTop(
       const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopExit(
-      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet,
+      BlockFrequency &ExitFreq);
   BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
   void buildLoopChains(const MachineLoop &L);
   void rotateLoop(
       BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
-      const BlockFilterSet &LoopBlockSet);
+      BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet);
   void rotateLoopWithProfile(
       BlockChain &LoopChain, const MachineLoop &L,
       const BlockFilterSet &LoopBlockSet);
@@ -1790,66 +1799,205 @@ MachineBlockPlacement::canMoveBottomBlockToTop(
   return true;
 }
 
-/// Find the best loop top block for layout.
+// Find out the possible fall through frequency to the top of a loop.
+BlockFrequency
+MachineBlockPlacement::TopFallThroughFreq(
+    const MachineBasicBlock *Top,
+    const BlockFilterSet &LoopBlockSet) {
+  BlockFrequency MaxFreq = 0;
+  for (MachineBasicBlock *Pred : Top->predecessors()) {
+    BlockChain *PredChain = BlockToChain[Pred];
+    if (!LoopBlockSet.count(Pred) &&
+        (!PredChain || Pred == *std::prev(PredChain->end()))) {
+      // Found a Pred block can be placed before Top.
+      // Check if Top is the best successor of Pred.
+      auto TopProb = MBPI->getEdgeProbability(Pred, Top);
+      bool TopOK = true;
+      for (MachineBasicBlock *Succ : Pred->successors()) {
+        auto SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+        BlockChain *SuccChain = BlockToChain[Succ];
+        // Check if Succ can be placed after Pred.
+        // Succ should not be in any chain, or it is the head of some chain.
+        if (!LoopBlockSet.count(Succ) && (SuccProb > TopProb) &&
+            (!SuccChain || Succ == *SuccChain->begin())) {
+          TopOK = false;
+          break;
+        }
+      }
+      if (TopOK) {
+        BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
+                                  MBPI->getEdgeProbability(Pred, Top);
+        if (EdgeFreq > MaxFreq)
+          MaxFreq = EdgeFreq;
+      }
+    }
+  }
+  return MaxFreq;
+}
+
+// Compute the fall through gains when moving NewTop before OldTop.
+//
+// In the following diagram, edges marked "-" are reduced fallthrough and edges
+// marked "+" are increased fallthrough; this function computes
+//
+//      SUM(increased fallthrough) - SUM(decreased fallthrough)
+//
+//              |
+//              | -
+//              V
+//        --->OldTop
+//        |     .
+//        |     .
+//       +|     .    +
+//        |   Pred --->
+//        |     |-
+//        |     V
+//        --- NewTop <---
+//              |-
+//              V
+//
+BlockFrequency
+MachineBlockPlacement::FallThroughGains(
+    const MachineBasicBlock *NewTop,
+    const MachineBasicBlock *OldTop,
+    const MachineBasicBlock *ExitBB,
+    const BlockFilterSet &LoopBlockSet) {
+  BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet);
+  BlockFrequency FallThrough2Exit = 0;
+  if (ExitBB)
+    FallThrough2Exit = MBFI->getBlockFreq(NewTop) *
+        MBPI->getEdgeProbability(NewTop, ExitBB);
+  BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) *
+      MBPI->getEdgeProbability(NewTop, OldTop);
+
+  // Find the best Pred of NewTop.
+   MachineBasicBlock *BestPred = nullptr;
+   BlockFrequency FallThroughFromPred = 0;
+   for (MachineBasicBlock *Pred : NewTop->predecessors()) {
+     if (!LoopBlockSet.count(Pred))
+       continue;
+     BlockChain *PredChain = BlockToChain[Pred];
+     if (!PredChain || Pred == *std::prev(PredChain->end())) {
+       BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) *
+           MBPI->getEdgeProbability(Pred, NewTop);
+       if (EdgeFreq > FallThroughFromPred) {
+         FallThroughFromPred = EdgeFreq;
+         BestPred = Pred;
+       }
+     }
+   }
+
+   // If NewTop is not placed after Pred, another successor can be placed
+   // after Pred.
+   BlockFrequency NewFreq = 0;
+   if (BestPred) {
+     for (MachineBasicBlock *Succ : BestPred->successors()) {
+       if ((Succ == NewTop) || (Succ == BestPred) || !LoopBlockSet.count(Succ))
+         continue;
+       if (ComputedEdges.find(Succ) != ComputedEdges.end())
+         continue;
+       BlockChain *SuccChain = BlockToChain[Succ];
+       if ((SuccChain && (Succ != *SuccChain->begin())) ||
+           (SuccChain == BlockToChain[BestPred]))
+         continue;
+       BlockFrequency EdgeFreq = MBFI->getBlockFreq(BestPred) *
+           MBPI->getEdgeProbability(BestPred, Succ);
+       if (EdgeFreq > NewFreq)
+         NewFreq = EdgeFreq;
+     }
+     BlockFrequency OrigEdgeFreq = MBFI->getBlockFreq(BestPred) *
+         MBPI->getEdgeProbability(BestPred, NewTop);
+     if (NewFreq > OrigEdgeFreq) {
+       // If NewTop is not the best successor of Pred, then Pred doesn't
+       // fall through to NewTop, so there is no FallThroughFromPred or
+       // NewFreq.
+       NewFreq = 0;
+       FallThroughFromPred = 0;
+     }
+   }
+
+   BlockFrequency Result = 0;
+   BlockFrequency Gains = BackEdgeFreq + NewFreq;
+   BlockFrequency Lost = FallThrough2Top + FallThrough2Exit +
+       FallThroughFromPred;
+   if (Gains > Lost)
+     Result = Gains - Lost;
+   return Result;
+}
+
+/// Helper function of findBestLoopTop. Find the best loop top block
+/// from the predecessors of the old top.
 ///
-/// Look for a block which is strictly better than the loop header for laying
-/// out at the top of the loop. This looks for one and only one pattern:
-/// a latch block with no conditional exit. This block will cause a conditional
-/// jump around it or will be the bottom of the loop if we lay it out in place,
-/// but if it it doesn't end up at the bottom of the loop for any reason,
-/// rotation alone won't fix it. Because such a block will always result in an
-/// unconditional jump (for the backedge) rotating it in front of the loop
-/// header is always profitable.
+/// Look for a block which is strictly better than the old top for laying
+/// out at the top of the loop. This looks for only two patterns:
+///
+///     1. a block has only one successor, the old loop top
+///
+///        Because such a block will always result in an unconditional jump,
+///        rotating it in front of the old top is always profitable.
+///
+///     2. a block with two successors, one the old top and the other an exit,
+///        and with more than one predecessor
+///
+///        If it is laid out below one of its predecessors P, only P can fall
+///        through to it; all other predecessors need a jump to it, plus a
+///        conditional jump to the loop header. If it is moved before the loop
+///        header, all its predecessors can jump to it and then fall through
+///        to the loop header, so every predecessor except P saves one taken
+///        branch. At the same time, moving it before the old top adds a taken
+///        branch to the loop exit block, so the saved taken branches are
+///        weighed against the new taken branch to the loop exit block.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
-                                       const BlockFilterSet &LoopBlockSet) {
-  // Placing the latch block before the header may introduce an extra branch
-  // that skips this block the first time the loop is executed, which we want
-  // to avoid when optimising for size.
-  // FIXME: in theory there is a case that does not introduce a new branch,
-  // i.e. when the layout predecessor does not fallthrough to the loop header.
-  // In practice this never happens though: there always seems to be a preheader
-  // that can fallthrough and that is also placed before the header.
-  if (F->getFunction().hasOptSize())
-    return L.getHeader();
-
+MachineBlockPlacement::findBestLoopTopHelper(
+    MachineBasicBlock *OldTop,
+    const MachineLoop &L,
+    const BlockFilterSet &LoopBlockSet) {
   // Check that the header hasn't been fused with a preheader block due to
   // crazy branches. If it has, we need to start with the header at the top to
   // prevent pulling the preheader into the loop body.
-  BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+  BlockChain &HeaderChain = *BlockToChain[OldTop];
   if (!LoopBlockSet.count(*HeaderChain.begin()))
-    return L.getHeader();
+    return OldTop;
 
-  LLVM_DEBUG(dbgs() << "Finding best loop top for: "
-                    << getBlockName(L.getHeader()) << "\n");
+  LLVM_DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(OldTop)
+                    << "\n");
 
-  BlockFrequency BestPredFreq;
+  BlockFrequency BestGains = 0;
   MachineBasicBlock *BestPred = nullptr;
-  for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
+  for (MachineBasicBlock *Pred : OldTop->predecessors()) {
     if (!LoopBlockSet.count(Pred))
       continue;
-    LLVM_DEBUG(dbgs() << "    header pred: " << getBlockName(Pred) << ", has "
+    if (Pred == L.getHeader())
+      continue;
+    LLVM_DEBUG(dbgs() << "   old top pred: " << getBlockName(Pred) << ", has "
                       << Pred->succ_size() << " successors, ";
                MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
-    if (Pred->succ_size() > 1)
+    if (Pred->succ_size() > 2)
       continue;
 
-    if (!canMoveBottomBlockToTop(Pred, L.getHeader()))
+    MachineBasicBlock *OtherBB = nullptr;
+    if (Pred->succ_size() == 2) {
+      OtherBB = *Pred->succ_begin();
+      if (OtherBB == OldTop)
+        OtherBB = *Pred->succ_rbegin();
+    }
+
+    if (!canMoveBottomBlockToTop(Pred, OldTop))
       continue;
 
-    BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
-    if (!BestPred || PredFreq > BestPredFreq ||
-        (!(PredFreq < BestPredFreq) &&
-         Pred->isLayoutSuccessor(L.getHeader()))) {
+    BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB,
+                                            LoopBlockSet);
+    if ((Gains > 0) && (Gains > BestGains ||
+        ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) {
       BestPred = Pred;
-      BestPredFreq = PredFreq;
+      BestGains = Gains;
     }
   }
 
   // If no direct predecessor is fine, just use the loop header.
   if (!BestPred) {
     LLVM_DEBUG(dbgs() << "    final top unchanged\n");
-    return L.getHeader();
+    return OldTop;
   }
 
   // Walk backwards through any straight line of predecessors.
@@ -1862,6 +2010,34 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
   return BestPred;
 }
 
+/// Find the best loop top block for layout.
+///
+/// This function iteratively calls findBestLoopTopHelper until no better
+/// BB can be found.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
+                                       const BlockFilterSet &LoopBlockSet) {
+  // Placing the latch block before the header may introduce an extra branch
+  // that skips this block the first time the loop is executed, which we want
+  // to avoid when optimising for size.
+  // FIXME: in theory there is a case that does not introduce a new branch,
+  // i.e. when the layout predecessor does not fallthrough to the loop header.
+  // In practice this never happens though: there always seems to be a preheader
+  // that can fallthrough and that is also placed before the header.
+  if (F->getFunction().hasOptSize())
+    return L.getHeader();
+
+  MachineBasicBlock *OldTop = nullptr;
+  MachineBasicBlock *NewTop = L.getHeader();
+  while (NewTop != OldTop) {
+    OldTop = NewTop;
+    NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet);
+    if (NewTop != OldTop)
+      ComputedEdges[NewTop] = { OldTop, false };
+  }
+  return NewTop;
+}
+
 /// Find the best loop exiting block for layout.
 ///
 /// This routine implements the logic to analyze the loop looking for the best
@@ -1869,7 +2045,8 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
 /// fallthrough opportunities.
 MachineBasicBlock *
 MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
-                                        const BlockFilterSet &LoopBlockSet) {
+                                        const BlockFilterSet &LoopBlockSet,
+                                        BlockFrequency &ExitFreq) {
   // We don't want to layout the loop linearly in all cases. If the loop header
   // is just a normal basic block in the loop, we want to look for what block
   // within the loop is the best one to layout at the top. However, if the loop
@@ -1980,6 +2157,7 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
 
   LLVM_DEBUG(dbgs() << "  Best exiting block: " << getBlockName(ExitingBB)
                     << "\n");
+  ExitFreq = BestExitEdgeFreq;
   return ExitingBB;
 }
 
@@ -2024,6 +2202,7 @@ MachineBlockPlacement::hasViableTopFallthrough(
 /// of its bottom already, don't rotate it.
 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
                                        const MachineBasicBlock *ExitingBB,
+                                       BlockFrequency ExitFreq,
                                        const BlockFilterSet &LoopBlockSet) {
   if (!ExitingBB)
     return;
@@ -2047,6 +2226,12 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
           (!SuccChain || Succ == *SuccChain->begin()))
         return;
     }
+
+    // Rotation will destroy the top fallthrough, so we need to ensure the new
+    // exit frequency is larger than the top fallthrough frequency.
+    BlockFrequency FallThrough2Top = TopFallThroughFreq(Top, LoopBlockSet);
+    if (FallThrough2Top >= ExitFreq)
+      return;
   }
 
   BlockChain::iterator ExitIt = llvm::find(LoopChain, ExitingBB);
@@ -2102,8 +2287,6 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
 void MachineBlockPlacement::rotateLoopWithProfile(
     BlockChain &LoopChain, const MachineLoop &L,
     const BlockFilterSet &LoopBlockSet) {
-  auto HeaderBB = L.getHeader();
-  auto HeaderIter = llvm::find(LoopChain, HeaderBB);
   auto RotationPos = LoopChain.end();
 
   BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
@@ -2123,12 +2306,13 @@ void MachineBlockPlacement::rotateLoopWithProfile(
   // chain head is not the loop header. As we only consider natural loops with
   // single header, this computation can be done only once.
   BlockFrequency HeaderFallThroughCost(0);
-  for (auto *Pred : HeaderBB->predecessors()) {
+  MachineBasicBlock *ChainHeaderBB = *LoopChain.begin();
+  for (auto *Pred : ChainHeaderBB->predecessors()) {
     BlockChain *PredChain = BlockToChain[Pred];
     if (!LoopBlockSet.count(Pred) &&
         (!PredChain || Pred == *std::prev(PredChain->end()))) {
-      auto EdgeFreq =
-          MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
+      auto EdgeFreq = MBFI->getBlockFreq(Pred) *
+          MBPI->getEdgeProbability(Pred, ChainHeaderBB);
       auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
       // If the predecessor has only an unconditional jump to the header, we
       // need to consider the cost of this jump.
@@ -2178,7 +2362,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
     // If the current BB is the loop header, we need to take into account the
     // cost of the missed fall through edge from outside of the loop to the
     // header.
-    if (Iter != HeaderIter)
+    if (Iter != LoopChain.begin())
       Cost += HeaderFallThroughCost;
 
     // Collect the loop exit cost by summing up frequencies of all exit edges
@@ -2299,9 +2483,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   // loop. This will default to the header, but may end up as one of the
   // predecessors to the header if there is one which will result in strictly
   // fewer branches in the loop body.
-  // When we use profile data to rotate the loop, this is unnecessary.
-  MachineBasicBlock *LoopTop =
-      RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
+  MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet);
 
   // If we selected just the header for the loop top, look for a potentially
   // profitable exit block in the event that rotating the loop can eliminate
@@ -2310,8 +2492,9 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
  // Loops are processed innermost to outermost, make sure we clear
   // PreferredLoopExit before processing a new loop.
   PreferredLoopExit = nullptr;
+  BlockFrequency ExitFreq;
   if (!RotateLoopWithProfile && LoopTop == L.getHeader())
-    PreferredLoopExit = findBestLoopExit(L, LoopBlockSet);
+    PreferredLoopExit = findBestLoopExit(L, LoopBlockSet, ExitFreq);
 
   BlockChain &LoopChain = *BlockToChain[LoopTop];
 
@@ -2331,7 +2514,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   if (RotateLoopWithProfile)
     rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
   else
-    rotateLoop(LoopChain, PreferredLoopExit, LoopBlockSet);
+    rotateLoop(LoopChain, PreferredLoopExit, ExitFreq, LoopBlockSet);
 
   LLVM_DEBUG({
     // Crash at the end so we get all of the debugging output first.

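To make the profitability test in FallThroughGains concrete, here is a worked example with invented frequencies (a sketch only; none of these numbers come from the patch or from real profile data):

    Gains = BackEdgeFreq + NewFreq
          = 90 + 40                                          = 130
    Lost  = FallThrough2Top + FallThrough2Exit + FallThroughFromPred
          = 20 + 10 + 60                                     =  90
    Gains > Lost, so the move is profitable and the candidate is recorded
    with a gain of 130 - 90 = 40.

BackEdgeFreq is the fallthrough gained along the backedge into the old top, NewFreq is the fallthrough another successor can gain behind BestPred, and the three Lost terms are the fallthroughs destroyed by the move.
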
+ 1 - 1
test/CodeGen/AArch64/cmpxchg-idioms.ll

@@ -111,7 +111,7 @@ define i1 @test_conditional2(i32 %a, i32 %b, i32* %c) {
 ; CHECK: mov w22, #2
 ; CHECK-NOT: mov w22, #4
 ; CHECK-NOT: cmn w22, #4
-; CHECK: b [[LOOP2:LBB[0-9]+_[0-9]+]]
+; CHECK: [[LOOP2:LBB[0-9]+_[0-9]+]]: ; %for.cond
 ; CHECK-NOT: b.ne [[LOOP2]]
 ; CHECK-NOT: b {{LBB[0-9]+_[0-9]+}}
 ; CHECK: bl _foo

+ 1 - 1
test/CodeGen/AArch64/neg-imm.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -disable-block-placement -o - %s | FileCheck %s
 ; LSR used to pick a sub-optimal solution due to the target responding
 ; conservatively to isLegalAddImmediate for negative values.
 

+ 2 - 3
test/CodeGen/AArch64/tailmerging_in_mbp.ll

@@ -1,9 +1,8 @@
 ; RUN: llc <%s -mtriple=aarch64-eabi -verify-machine-dom-info | FileCheck %s
 
 ; CHECK-LABEL: test:
-; CHECK:       LBB0_7:
-; CHECK:         b.hi	
-; CHECK-NEXT:    b	
+; CHECK-LABEL: %cond.false12.i
+; CHECK:         b.gt	
 ; CHECK-NEXT:  LBB0_8:
 ; CHECK-NEXT:    mov	 x8, x9
 ; CHECK-NEXT:  LBB0_9:

+ 5 - 4
test/CodeGen/AMDGPU/collapse-endcf.ll

@@ -230,6 +230,11 @@ bb.end:                                           ; preds = %bb.then, %bb
 ; Make sure scc liveness is updated if sor_b64 is removed
 ; ALL-LABEL: {{^}}scc_liveness:
 
+; GCN: %bb10
+; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
+; GCN: s_andn2_b64
+; GCN-NEXT: s_cbranch_execz
+
 ; GCN: [[BB1_LOOP:BB[0-9]+_[0-9]+]]:
 ; GCN: s_andn2_b64 exec, exec,
 ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]]
@@ -239,10 +244,6 @@ bb.end:                                           ; preds = %bb.then, %bb
 
 ; GCN-NOT: s_or_b64 exec, exec
 
-; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
-; GCN: s_andn2_b64
-; GCN-NEXT: s_cbranch_execnz
-
 ; GCN: s_or_b64 exec, exec, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword

+ 21 - 20
test/CodeGen/AMDGPU/divergent-branch-uniform-condition.ll

@@ -19,38 +19,39 @@ define amdgpu_ps void @main(i32, float) {
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
 ; CHECK-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; CHECK-NEXT:  BB0_1: ; %loop
+; CHECK-NEXT:    s_branch BB0_3
+; CHECK-NEXT:  BB0_1: ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:  BB0_2: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
+; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
+; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
+; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; CHECK-NEXT:    s_cbranch_execz BB0_7
+; CHECK-NEXT:  BB0_3: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 32, v1
 ; CHECK-NEXT:    s_and_b64 vcc, exec, vcc
 ; CHECK-NEXT:    s_or_b64 s[6:7], s[6:7], exec
 ; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT:    s_cbranch_vccz BB0_5
-; CHECK-NEXT:  ; %bb.2: ; %endif1
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    s_cbranch_vccz BB0_1
+; CHECK-NEXT:  ; %bb.4: ; %endif1
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_mov_b64 s[6:7], -1
 ; CHECK-NEXT:    s_and_saveexec_b64 s[8:9], s[0:1]
 ; CHECK-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; CHECK-NEXT:    ; mask branch BB0_4
-; CHECK-NEXT:  BB0_3: ; %endif2
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:    ; mask branch BB0_6
+; CHECK-NEXT:  BB0_5: ; %endif2
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
 ; CHECK-NEXT:    s_xor_b64 s[6:7], exec, -1
-; CHECK-NEXT:  BB0_4: ; %Flow1
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT:  BB0_6: ; %Flow1
+; CHECK-NEXT:    ; in Loop: Header=BB0_3 Depth=1
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
-; CHECK-NEXT:    s_branch BB0_6
-; CHECK-NEXT:  BB0_5: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    ; implicit-def: $vgpr1
-; CHECK-NEXT:  BB0_6: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    s_and_b64 s[8:9], exec, s[6:7]
-; CHECK-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
-; CHECK-NEXT:    s_mov_b64 s[4:5], s[8:9]
-; CHECK-NEXT:    s_andn2_b64 exec, exec, s[8:9]
-; CHECK-NEXT:    s_cbranch_execnz BB0_1
-; CHECK-NEXT:  ; %bb.7: ; %Flow2
+; CHECK-NEXT:    s_branch BB0_2
+; CHECK-NEXT:  BB0_7: ; %Flow2
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[8:9]
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 0
 ; this is the divergent branch with the condition not marked as divergent

+ 11 - 10
test/CodeGen/AMDGPU/global_smrd_cfg.ll

@@ -1,27 +1,28 @@
 ; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs  < %s | FileCheck %s
 
-; CHECK-LABEL: %bb11
+; CHECK-LABEL: %bb22
 
-; Load from %arg in a Loop body has alias store
+; Load from %arg has alias store in Loop
 
 ; CHECK: flat_load_dword
 
-; CHECK-LABEL: %bb20
-; CHECK: flat_store_dword
+; #####################################################################
+
+; Load from %arg1 has no-alias store in Loop - arg1[i+1] never alias arg1[i]
+
+; CHECK: s_load_dword
 
 ; #####################################################################
 
-; CHECK-LABEL: %bb22
+; CHECK-LABEL: %bb11
 
-; Load from %arg has alias store in Loop
+; Load from %arg in a Loop body has alias store
 
 ; CHECK: flat_load_dword
 
-; #####################################################################
-
-; Load from %arg1 has no-alias store in Loop - arg1[i+1] never alias arg1[i]
+; CHECK-LABEL: %bb20
 
-; CHECK: s_load_dword
+; CHECK: flat_store_dword
 
 define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
 bb:

+ 1 - 1
test/CodeGen/AMDGPU/hoist-cond.ll

@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck %s
 
 ; Check that invariant compare is hoisted out of the loop.
 ; At the same time condition shall not be serialized into a VGPR and deserialized later

+ 6 - 6
test/CodeGen/AMDGPU/i1-copy-from-loop.ll

@@ -3,20 +3,20 @@
 
 ; SI-LABEL: {{^}}i1_copy_from_loop:
 ;
+; SI: ; %Flow
+; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
+; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], exec
+; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
+
 ; SI: ; %for.body
 ; SI:      v_cmp_gt_u32_e64  [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
-; SI-DAG:  s_andn2_b64       [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
+; SI-DAG:  s_andn2_b64       [[CC_ACCUM]], [[CC_ACCUM]], exec
 ; SI-DAG:  s_and_b64         [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
 ; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
 
 ; SI: ; %Flow1
 ; SI:      s_or_b64          [[CC_ACCUM]], [[CC_ACCUM]], exec
 
-; SI: ; %Flow
-; SI-DAG:  s_andn2_b64       [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
-; SI-DAG:  s_and_b64         [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
-; SI:      s_or_b64          [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]
-
 ; SI: ; %for.end
 ; SI:      s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]
 

+ 6 - 6
test/CodeGen/AMDGPU/indirect-addressing-si.ll

@@ -630,12 +630,7 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(
 ; GCN-LABEL: {{^}}broken_phi_bb:
 ; GCN: v_mov_b32_e32 [[PHIREG:v[0-9]+]], 8
 
-; GCN: s_branch [[BB2:BB[0-9]+_[0-9]+]]
-
-; GCN: {{^BB[0-9]+_[0-9]+}}:
-; GCN: s_mov_b64 exec,
-
-; GCN: [[BB2]]:
+; GCN: [[BB2:BB[0-9]+_[0-9]+]]:
 ; GCN: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, [[PHIREG]]
 ; GCN: buffer_load_dword
 
@@ -647,6 +642,11 @@ define amdgpu_kernel void @insertelement_v16f32_or_index(<16 x float> addrspace(
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: s_cbranch_execnz [[REGLOOP]]
+
+; GCN: {{^; %bb.[0-9]}}:
+; GCN: s_mov_b64 exec,
+; GCN: s_branch [[BB2]]
+
 define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:
   br label %bb2

+ 1 - 1
test/CodeGen/AMDGPU/loop_break.ll

@@ -1,5 +1,5 @@
 ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -disable-block-placement < %s | FileCheck -check-prefix=GCN %s
 
 ; Uses llvm.amdgcn.break
 

+ 2 - 2
test/CodeGen/AMDGPU/loop_exit_with_xor.ll

@@ -61,9 +61,9 @@ loopexit:
 
 ; GCN-LABEL: {{^}}break_cond_is_arg:
 ; GCN: s_xor_b64 [[REG1:[^ ,]*]], {{[^ ,]*, -1$}}
+; GCN: s_andn2_b64 exec, exec, [[REG3:[^ ,]*]]
 ; GCN: s_and_b64 [[REG2:[^ ,]*]], exec, [[REG1]]
-; GCN: s_or_b64 [[REG3:[^ ,]*]], [[REG2]],
-; GCN: s_andn2_b64 exec, exec, [[REG3]]
+; GCN: s_or_b64 [[REG3]], [[REG2]],
 
 define void @break_cond_is_arg(i32 %arg, i1 %breakcond) {
 entry:

+ 1 - 1
test/CodeGen/AMDGPU/madmk.ll

@@ -188,9 +188,9 @@ define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias
 }
 
 ; SI-LABEL: {{^}}kill_madmk_verifier_error:
+; SI: s_or_b64
 ; SI: s_xor_b64
 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
-; SI: s_or_b64
 define amdgpu_kernel void @kill_madmk_verifier_error() nounwind {
 bb:
   br label %bb2

+ 28 - 28
test/CodeGen/AMDGPU/multilevel-break.ll

@@ -24,13 +24,29 @@
 ; GCN: ; %main_body
 ; GCN:      s_mov_b64           [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: [[FLOW2:BB[0-9]+_[0-9]+]]: ; %Flow2
+; GCN:      s_or_b64            exec, exec, [[TMP0:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
+; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
+; GCN:      s_cbranch_execz    [[IF_BLOCK:BB[0-9]+_[0-9]+]]
+
 ; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
 ; GCN:      s_mov_b64           [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: ; %Flow
+; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_and_b64           [[TMP0]], exec, [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
+; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
+; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
+; GCN:      s_cbranch_execz    [[FLOW2]]
+
 ; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
-; GCN:      s_or_b64            [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
-; GCN:      s_or_b64            [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
-; GCN:      s_and_saveexec_b64  [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN:      s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], exec
+; GCN:      s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], exec
+; GCN:      s_and_saveexec_b64  [[SAVE_EXEC]], vcc
 
 ; FIXME: duplicate comparison
 ; GCN: ; %ENDIF
@@ -43,23 +59,7 @@
 ; GCN-DAG:  s_or_b64            [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
 ; GCN-DAG:  s_or_b64            [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]
 
-; GCN: ; %Flow
-; GCN:      s_or_b64            exec, exec, [[SAVE_EXEC]]
-; GCN:      s_and_b64           [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
-; GCN:      s_or_b64            [[TMP0]], [[TMP0]], [[LEFT_INNER]]
-; GCN:      s_mov_b64           [[LEFT_INNER]], [[TMP0]]
-; GCN:      s_andn2_b64         exec, exec, [[TMP0]]
-; GCN:      s_cbranch_execnz    [[INNER_LOOP]]
-
-; GCN: ; %Flow2
-; GCN:      s_or_b64            exec, exec, [[TMP0]]
-; GCN:      s_and_b64           [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
-; GCN:      s_or_b64            [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
-; GCN:      s_mov_b64           [[LEFT_OUTER]], [[TMP1]]
-; GCN:      s_andn2_b64         exec, exec, [[TMP1]]
-; GCN:      s_cbranch_execnz    [[OUTER_LOOP]]
-
-; GCN: ; %IF
+; GCN: [[IF_BLOCK]]: ; %IF
 ; GCN-NEXT: s_endpgm
 define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 main_body:
@@ -92,12 +92,18 @@ ENDIF:                                            ; preds = %LOOP
 ; GCN-LABEL: {{^}}multi_if_break_loop:
 ; GCN:      s_mov_b64          [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 
+; GCN: ; %Flow4
+; GCN:      s_and_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK]]
+; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]]
+; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
+; GCN-NEXT: s_cbranch_execz
+
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
-; GCN:      s_mov_b64          [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
+; GCN:      s_mov_b64          [[OLD_LEFT]], [[LEFT]]
 
 ; GCN: ; %LeafBlock1
 ; GCN:      s_mov_b64
-; GCN:      s_mov_b64          [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}
+; GCN:      s_mov_b64          [[BREAK]], -1{{$}}
 
 ; GCN: ; %case1
 ; GCN:      buffer_load_dword  [[LOAD2:v[0-9]+]],
@@ -118,12 +124,6 @@ ENDIF:                                            ; preds = %LOOP
 ; GCN-DAG:  s_and_b64          [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
 ; GCN:      s_or_b64           [[BREAK]], [[BREAK]], [[TMP]]
 
-; GCN: ; %Flow4
-; GCN:      s_and_b64          [[BREAK]], exec, [[BREAK]]
-; GCN:      s_or_b64           [[LEFT]], [[BREAK]], [[OLD_LEFT]]
-; GCN:      s_andn2_b64        exec, exec, [[LEFT]]
-; GCN-NEXT: s_cbranch_execnz
-
 define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()

+ 4 - 4
test/CodeGen/AMDGPU/optimize-negated-cond.ll

@@ -3,11 +3,11 @@
 ; GCN-LABEL: {{^}}negated_cond:
 ; GCN: BB0_1:
 ; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB0_2:
+; GCN: BB0_3:
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp
 ; GCN:   s_andn2_b64 vcc, exec, [[CC]]
-; GCN:   s_cbranch_vccnz BB0_4
+; GCN:   s_cbranch_vccnz BB0_2
 define amdgpu_kernel void @negated_cond(i32 addrspace(1)* %arg1) {
 bb:
   br label %bb1
@@ -36,11 +36,11 @@ bb4:
 
 ; GCN-LABEL: {{^}}negated_cond_dominated_blocks:
 ; GCN:   v_cmp_eq_u32_e64 [[CC:[^,]+]],
-; GCN: BB1_1:
+; GCN: %bb4
 ; GCN-NOT: v_cndmask_b32
 ; GCN-NOT: v_cmp
 ; GCN:   s_andn2_b64 vcc, exec, [[CC]]
-; GCN:   s_cbranch_vccz BB1_3
+; GCN:   s_cbranch_vccnz BB1_1
 define amdgpu_kernel void @negated_cond_dominated_blocks(i32 addrspace(1)* %arg1) {
 bb:
   br label %bb2

+ 6 - 6
test/CodeGen/AMDGPU/si-annotate-cf.ll

@@ -96,20 +96,20 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; FUNC-LABEL: {{^}}loop_land_info_assert:
 ; SI:      v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
 ; SI:      s_and_b64        [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
-; SI:      s_branch         [[INFLOOP:BB[0-9]+_[0-9]+]]
+
+; SI: [[WHILELOOP:BB[0-9]+_[0-9]+]]: ; %while.cond
+; SI:      s_cbranch_vccz [[FOR_COND_PH:BB[0-9]+_[0-9]+]]
 
 ; SI:      [[CONVEX_EXIT:BB[0-9_]+]]
 ; SI:      s_mov_b64        vcc,
 ; SI-NEXT: s_cbranch_vccnz  [[ENDPGM:BB[0-9]+_[0-9]+]]
-; SI:      s_cbranch_vccnz  [[INFLOOP]]
+
+; SI:      s_cbranch_vccnz  [[WHILELOOP]]
 
 ; SI: ; %if.else
 ; SI:      buffer_store_dword
 
-; SI:      [[INFLOOP]]:
-; SI:      s_cbranch_vccnz [[CONVEX_EXIT]]
-
-; SI: ; %for.cond.preheader
+; SI: [[FOR_COND_PH]]: ; %for.cond.preheader
 ; SI:      s_cbranch_vccz [[ENDPGM]]
 
 ; SI:      [[ENDPGM]]:

+ 1 - 1
test/CodeGen/AMDGPU/valu-i1.ll

@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement < %s | FileCheck -check-prefix=SI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 

+ 7 - 4
test/CodeGen/AMDGPU/wqm.ll

@@ -650,12 +650,15 @@ main_body:
 ; CHECK-DAG: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 ; CHECK-DAG: s_mov_b32 [[SEVEN:s[0-9]+]], 0x40e00000
 
-; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
-; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %loop
 ; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
-; CHECK: s_cbranch_vccz [[LOOPHDR]]
-; CHECK: ; %break
+; CHECK: s_cbranch_vccnz
 
+; CHECK: ; %body
+; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
+; CHECK: s_branch [[LOOPHDR]]
+
+; CHECK: ; %break
 ; CHECK: ; return
 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
 entry:

+ 1 - 1
test/CodeGen/ARM/2011-03-23-PeepholeBug.ll

@@ -26,7 +26,7 @@ bb1:                                              ; preds = %bb
 
 bb2:                                              ; preds = %bb1, %entry
 ; CHECK: cmp [[REG]], #0
-; CHECK: ble
+; CHECK: bgt
   %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
   %tries.0 = sub i32 2147483647, %indvar
   %tmp1 = icmp sgt i32 %tries.0, 0

+ 2 - 3
test/CodeGen/ARM/arm-and-tst-peephole.ll

@@ -47,9 +47,8 @@ tailrecurse.switch:                               ; preds = %tailrecurse
 ; V8-NEXT: beq
 ; V8-NEXT: %tailrecurse.switch
 ; V8: cmp
-; V8-NEXT: beq
-; V8-NEXT: %sw.epilog
-; V8-NEXT: bx lr
+; V8-NEXT: bne
+; V8-NEXT: %sw.bb
   switch i32 %and, label %sw.epilog [
     i32 1, label %sw.bb
     i32 3, label %sw.bb6

+ 1 - 1
test/CodeGen/ARM/atomic-cmp.ll

@@ -9,8 +9,8 @@ define i8 @t(i8* %a, i8 %b, i8 %c) nounwind {
 ; ARM: clrex
 
 ; T2-LABEL: t:
-; T2: strexb
 ; T2: ldrexb
+; T2: strexb
 ; T2: clrex
   %tmp0 = cmpxchg i8* %a, i8 %b, i8 %c monotonic monotonic
   %tmp1 = extractvalue { i8, i1 } %tmp0, 0

+ 13 - 13
test/CodeGen/ARM/atomic-cmpxchg.ll

@@ -52,16 +52,16 @@ entry:
 ; CHECK-ARMV7-LABEL: test_cmpxchg_res_i8:
 ; CHECK-ARMV7-NEXT: .fnstart
 ; CHECK-ARMV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
-; CHECK-ARMV7-NEXT: b [[TRY:.LBB[0-9_]+]]
-; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
-; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
+; CHECK-ARMV7-NEXT: [[TRY:.LBB[0-9_]+]]:
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS:r[0-9]+]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
+; CHECK-ARMV7-NEXT: bne [[EXIT:.LBB[0-9_]+]]
+; CHECK-ARMV7-NEXT: strexb [[SUCCESS]], r2, [r0]
 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
 ; CHECK-ARMV7-NEXT: moveq r0, #1
 ; CHECK-ARMV7-NEXT: bxeq lr
-; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
-; CHECK-ARMV7-NEXT: beq [[HEAD]]
+; CHECK-ARMV7-NEXT: b [[TRY]]
+; CHECK-ARMV7-NEXT: [[EXIT]]:
 ; CHECK-ARMV7-NEXT: mov r0, #0
 ; CHECK-ARMV7-NEXT: clrex
 ; CHECK-ARMV7-NEXT: bx lr
@@ -69,17 +69,17 @@ entry:
 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
 ; CHECK-THUMBV7-NEXT: .fnstart
 ; CHECK-THUMBV7-NEXT: uxtb [[DESIRED:r[0-9]+]], r1
-; CHECK-THUMBV7-NEXT: b [[TRYLD:.LBB[0-9_]+]]
-; CHECK-THUMBV7-NEXT: [[TRYST:.LBB[0-9_]+]]:
+; CHECK-THUMBV7-NEXT: [[TRYLD:.LBB[0-9_]+]]
+; CHECK-THUMBV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
+; CHECK-THUMBV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-THUMBV7-NEXT: bne [[EXIT:.LBB[0-9_]+]]
 ; CHECK-THUMBV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-THUMBV7-NEXT: cmp [[SUCCESS]], #0
 ; CHECK-THUMBV7-NEXT: itt eq
 ; CHECK-THUMBV7-NEXT: moveq r0, #1
 ; CHECK-THUMBV7-NEXT: bxeq lr
-; CHECK-THUMBV7-NEXT: [[TRYLD]]:
-; CHECK-THUMBV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-THUMBV7-NEXT: cmp [[LD]], [[DESIRED]]
-; CHECK-THUMBV7-NEXT: beq [[TRYST:.LBB[0-9_]+]]
+; CHECK-THUMBV7-NEXT: b [[TRYLD]]
+; CHECK-THUMBV7-NEXT: [[EXIT]]:
 ; CHECK-THUMBV7-NEXT: movs r0, #0
 ; CHECK-THUMBV7-NEXT: clrex
 ; CHECK-THUMBV7-NEXT: bx lr

+ 2 - 2
test/CodeGen/ARM/code-placement.ll

@@ -38,8 +38,9 @@ entry:
   br i1 %0, label %bb5, label %bb.nph15
 
 bb1:                                              ; preds = %bb2.preheader, %bb1
+; CHECK: LBB1_[[BB3:.]]: @ %bb3
 ; CHECK: LBB1_[[PREHDR:.]]: @ %bb2.preheader
-; CHECK: blt LBB1_[[BB3:.]]
+; CHECK: blt LBB1_[[BB3]]
   %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %bb2.preheader ] ; <i32> [#uses=2]
   %sum.08 = phi i32 [ %2, %bb1 ], [ %sum.110, %bb2.preheader ] ; <i32> [#uses=1]
   %tmp17 = sub i32 %i.07, %indvar                 ; <i32> [#uses=1]
@@ -53,7 +54,6 @@ bb1:                                              ; preds = %bb2.preheader, %bb1
 bb3:                                              ; preds = %bb1, %bb2.preheader
 ; CHECK: LBB1_[[BB1:.]]: @ %bb1
 ; CHECK: bne LBB1_[[BB1]]
-; CHECK: LBB1_[[BB3]]: @ %bb3
   %sum.0.lcssa = phi i32 [ %sum.110, %bb2.preheader ], [ %2, %bb1 ] ; <i32> [#uses=2]
   %3 = add i32 %pass.011, 1                       ; <i32> [#uses=2]
   %exitcond18 = icmp eq i32 %3, %passes           ; <i1> [#uses=1]

+ 1 - 1
test/CodeGen/ARM/pr32578.ll

@@ -4,7 +4,7 @@ target triple = "armv7"
 ; CHECK-LABEL: func:
 ; CHECK: push {r11, lr}
 ; CHECK: vpush {d8}
-; CHECK: b .LBB0_2
+; CHECK: .LBB0_1: @ %tailrecurse
 define arm_aapcscc double @func() {
   br label %tailrecurse
 

+ 1 - 1
test/CodeGen/ARM/swifterror.ll

@@ -182,7 +182,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float
 ; CHECK-APPLE: mov r0, #16
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: strb r{{.*}}, [r0, #8]
-; CHECK-APPLE: ble
+; CHECK-APPLE: b
 
 ; CHECK-O0-LABEL: foo_loop:
 ; CHECK-O0: cmp r{{.*}}, #0

+ 1 - 1
test/CodeGen/Hexagon/bug6757-endloop.ll

@@ -4,10 +4,10 @@
 ; This situation can arise due to tail duplication.
 
 ; CHECK: loop1([[LP:.LBB0_[0-9]+]]
+; CHECK: endloop1
 ; CHECK: [[LP]]:
 ; CHECK-NOT: loop1(
 ; CHECK: endloop1
-; CHECK: endloop1
 
 %s.0 = type { i32, i8* }
 %s.1 = type { i32, i32, i32, i32 }

+ 3 - 1
test/CodeGen/Hexagon/early-if-merge-loop.ll

@@ -2,9 +2,11 @@
 ; Make sure that the loop in the end has only one basic block.
 
 ; CHECK-LABEL: fred
+; CHECK: %b2
 ; Rely on the comments, make sure the one for the loop header is present.
 ; CHECK: %loop
-; CHECK-NOT: %should_merge
+; CHECK: %should_merge
+; CHECK: %exit
 
 target triple = "hexagon"
 

+ 1 - 1
test/CodeGen/Hexagon/prof-early-if.ll

@@ -1,8 +1,8 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
 ; Rely on the comments generated by llc. Check that "if.then" was not predicated.
+; CHECK: b5
 ; CHECK: b2
 ; CHECK-NOT: if{{.*}}memd
-; CHECK: b5
 
 %s.0 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %s.1], [5 x i32] }
 %s.1 = type { i32, i32 }

+ 1 - 1
test/CodeGen/Hexagon/redundant-branching2.ll

@@ -3,9 +3,9 @@
 
 ; CHECK: memub
 ; CHECK: memub
+; CHECK: cmp.eq
 ; CHECK: memub
 ; CHECK-NOT: if{{.*}}jump .LBB
-; CHECK: cmp.eq
 
 target triple = "hexagon-unknown--elf"
 

+ 144 - 168
test/CodeGen/PowerPC/atomics-regression.ll

@@ -401,16 +401,15 @@ define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test40:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
-; PPC64LE-NEXT:    b .LBB40_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB40_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB40_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB40_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB40_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB40_1
+; PPC64LE-NEXT:  .LBB40_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val monotonic monotonic
@@ -466,16 +465,15 @@ define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB43_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB43_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB43_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB43_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB43_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB43_1
+; PPC64LE-NEXT:  .LBB43_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release monotonic
@@ -487,16 +485,15 @@ define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB44_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB44_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB44_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB44_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB44_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB44_1
+; PPC64LE-NEXT:  .LBB44_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release acquire
@@ -622,16 +619,15 @@ define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test50:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
-; PPC64LE-NEXT:    b .LBB50_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB50_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB50_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB50_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB50_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB50_1
+; PPC64LE-NEXT:  .LBB50_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val monotonic monotonic
@@ -687,16 +683,15 @@ define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB53_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB53_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB53_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB53_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB53_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB53_1
+; PPC64LE-NEXT:  .LBB53_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release monotonic
@@ -708,16 +703,15 @@ define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB54_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB54_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB54_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB54_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB54_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB54_1
+; PPC64LE-NEXT:  .LBB54_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release acquire
@@ -842,16 +836,15 @@ define void @test59(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test60(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test60:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    b .LBB60_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB60_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB60_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB60_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB60_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB60_1
+; PPC64LE-NEXT:  .LBB60_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val monotonic monotonic
@@ -904,16 +897,15 @@ define void @test63(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test63:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB63_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB63_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB63_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB63_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB63_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB63_1
+; PPC64LE-NEXT:  .LBB63_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release monotonic
@@ -924,16 +916,15 @@ define void @test64(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test64:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB64_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB64_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB64_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB64_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB64_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB64_1
+; PPC64LE-NEXT:  .LBB64_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release acquire
@@ -1053,16 +1044,15 @@ define void @test69(i32* %ptr, i32 %cmp, i32 %val) {
 define void @test70(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test70:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    b .LBB70_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB70_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB70_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB70_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB70_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB70_1
+; PPC64LE-NEXT:  .LBB70_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val monotonic monotonic
@@ -1115,16 +1105,15 @@ define void @test73(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test73:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB73_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB73_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB73_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB73_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB73_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB73_1
+; PPC64LE-NEXT:  .LBB73_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release monotonic
@@ -1135,16 +1124,15 @@ define void @test74(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test74:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB74_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB74_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB74_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB74_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB74_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB74_1
+; PPC64LE-NEXT:  .LBB74_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release acquire
@@ -1265,16 +1253,15 @@ define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE-LABEL: test80:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
-; PPC64LE-NEXT:    b .LBB80_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB80_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB80_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB80_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB80_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB80_1
+; PPC64LE-NEXT:  .LBB80_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") monotonic monotonic
@@ -1330,16 +1317,15 @@ define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB83_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB83_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB83_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB83_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB83_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB83_1
+; PPC64LE-NEXT:  .LBB83_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release monotonic
@@ -1351,16 +1337,15 @@ define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 24, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB84_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB84_1:
-; PPC64LE-NEXT:    stbcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB84_2:
 ; PPC64LE-NEXT:    lbarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB84_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB84_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB84_1
+; PPC64LE-NEXT:  .LBB84_3:
 ; PPC64LE-NEXT:    stbcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val syncscope("singlethread") release acquire
@@ -1486,16 +1471,15 @@ define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE-LABEL: test90:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
-; PPC64LE-NEXT:    b .LBB90_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB90_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB90_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB90_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB90_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB90_1
+; PPC64LE-NEXT:  .LBB90_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") monotonic monotonic
@@ -1551,16 +1535,15 @@ define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB93_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB93_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB93_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB93_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB93_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB93_1
+; PPC64LE-NEXT:  .LBB93_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release monotonic
@@ -1572,16 +1555,15 @@ define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    rlwinm 4, 4, 0, 16, 31
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB94_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB94_1:
-; PPC64LE-NEXT:    sthcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB94_2:
 ; PPC64LE-NEXT:    lharx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB94_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB94_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB94_1
+; PPC64LE-NEXT:  .LBB94_3:
 ; PPC64LE-NEXT:    sthcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val syncscope("singlethread") release acquire
@@ -1706,16 +1688,15 @@ define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
 define void @test100(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test100:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    b .LBB100_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB100_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB100_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB100_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB100_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB100_1
+; PPC64LE-NEXT:  .LBB100_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") monotonic monotonic
@@ -1768,16 +1749,15 @@ define void @test103(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test103:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB103_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB103_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB103_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB103_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB103_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB103_1
+; PPC64LE-NEXT:  .LBB103_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release monotonic
@@ -1788,16 +1768,15 @@ define void @test104(i32* %ptr, i32 %cmp, i32 %val) {
 ; PPC64LE-LABEL: test104:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB104_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB104_1:
-; PPC64LE-NEXT:    stwcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB104_2:
 ; PPC64LE-NEXT:    lwarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpw 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB104_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB104_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB104_1
+; PPC64LE-NEXT:  .LBB104_3:
 ; PPC64LE-NEXT:    stwcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val syncscope("singlethread") release acquire
@@ -1917,16 +1896,15 @@ define void @test109(i32* %ptr, i32 %cmp, i32 %val) {
 define void @test110(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test110:
 ; PPC64LE:       # %bb.0:
-; PPC64LE-NEXT:    b .LBB110_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB110_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB110_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB110_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB110_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB110_1
+; PPC64LE-NEXT:  .LBB110_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") monotonic monotonic
@@ -1979,16 +1957,15 @@ define void @test113(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test113:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB113_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB113_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB113_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB113_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB113_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB113_1
+; PPC64LE-NEXT:  .LBB113_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release monotonic
@@ -1999,16 +1976,15 @@ define void @test114(i64* %ptr, i64 %cmp, i64 %val) {
 ; PPC64LE-LABEL: test114:
 ; PPC64LE:       # %bb.0:
 ; PPC64LE-NEXT:    lwsync
-; PPC64LE-NEXT:    b .LBB114_2
-; PPC64LE-NEXT:    .p2align 5
 ; PPC64LE-NEXT:  .LBB114_1:
-; PPC64LE-NEXT:    stdcx. 5, 0, 3
-; PPC64LE-NEXT:    beqlr 0
-; PPC64LE-NEXT:  .LBB114_2:
 ; PPC64LE-NEXT:    ldarx 6, 0, 3
 ; PPC64LE-NEXT:    cmpd 4, 6
-; PPC64LE-NEXT:    beq 0, .LBB114_1
-; PPC64LE-NEXT:  # %bb.3:
+; PPC64LE-NEXT:    bne 0, .LBB114_3
+; PPC64LE-NEXT:  # %bb.2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB114_1
+; PPC64LE-NEXT:  .LBB114_3:
 ; PPC64LE-NEXT:    stdcx. 6, 0, 3
 ; PPC64LE-NEXT:    blr
   %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val syncscope("singlethread") release acquire
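
All of the atomics-regression.ll hunks above show the same reshaping: the load-and-compare block, a latch with a conditional exit and two predecessors (the entry block and the store-conditional block), is now placed first at .LBB*_1. Entry falls through into the loop instead of jumping into its middle, and the retry path becomes an explicit backedge (b .LBB*_1). A minimal sketch that reproduces this ll/sc loop shape, with a hypothetical file and function name (the cmpxchg line follows the tests above):

; Compile with: llc < sketch.ll -mtriple=powerpc64le-unknown-linux-gnu
define void @sketch(i64* %ptr, i64 %cmp, i64 %val) {
  ; Expands to the ldarx/cmpd/stdcx. retry loop shown in the hunks above.
  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val monotonic monotonic
  ret void
}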

+ 6 - 5
test/CodeGen/PowerPC/cmp_elimination.ll

@@ -718,13 +718,14 @@ if.end:
 define void @func28(i32 signext %a) {
 ; CHECK-LABEL: @func28
 ; CHECK: cmplwi	 [[REG1:[0-9]+]], [[REG2:[0-9]+]]
-; CHECK: .[[LABEL1:[A-Z0-9_]+]]:
+; CHECK: .[[LABEL2:[A-Z0-9_]+]]:
+; CHECK: cmpwi   [[REG1]], [[REG2]]
+; CHECK: ble     0, .[[LABEL1:[A-Z0-9_]+]]
 ; CHECK-NOT: cmp
-; CHECK: bne	 0, .[[LABEL2:[A-Z0-9_]+]]
+; CHECK: bne     0, .[[LABEL2]]
 ; CHECK: bl dummy1
-; CHECK: .[[LABEL2]]:
-; CHECK: cmpwi	 [[REG1]], [[REG2]]
-; CHECK: bgt	 0, .[[LABEL1]]
+; CHECK: b .[[LABEL2]]
+; CHECK: .[[LABEL1]]:
 ; CHECK: blr
 entry:
   br label %do.body

+ 2 - 1
test/CodeGen/PowerPC/ctrloop-shortLoops.ll

@@ -88,7 +88,8 @@ for.body:                                         ; preds = %entry, %for.body
 ; Function Attrs: norecurse nounwind
 define signext i32 @testTripCount2NonSmallLoop() {
 ; CHECK-LABEL: testTripCount2NonSmallLoop:
-; CHECK: bge
+; CHECK: blt
+; CHECK: beq
 ; CHECK: blr
 
 entry:

+ 5 - 5
test/CodeGen/PowerPC/expand-foldable-isel.ll

@@ -29,13 +29,13 @@ define void @_ZN3pov6ot_insEPPNS_14ot_node_structEPNS_15ot_block_structEPNS_12ot
 ;
 ; CHECK-LABEL: _ZN3pov6ot_insEPPNS_14ot_node_structEPNS_15ot_block_structEPNS_12ot_id_structE:
 ; CHECK:    mr r4, r3
-; CHECK:    bc 12, 4*cr5+lt, .LBB0_3
-; CHECK:   # %bb.2:
+; CHECK:    bc 12, 4*cr5+lt, [[CASE1:.LBB[0-9_]+]]
+; CHECK:   # %bb.
 ; CHECK:    ori r29, r6, 0
-; CHECK:    b .LBB0_4
-; CHECK:  .LBB0_3:
+; CHECK:    b [[MERGE:.LBB[0-9_]+]]
+; CHECK:  [[CASE1]]:
 ; CHECK:    addi r29, r5, 0
-; CHECK:  .LBB0_4:
+; CHECK:  [[MERGE]]:
 ; CHECK:    blr
 entry:
   br label %while.cond11

+ 1 - 1
test/CodeGen/PowerPC/knowCRBitSpill.ll

@@ -86,7 +86,7 @@ define dso_local signext i32 @spillCRUNSET(%struct.p5rx* readonly %p1, i32 signe
 ; CHECK-NOT:    mfocrf [[REG2:.*]], [[CREG]]
 ; CHECK-NOT:    rlwinm [[REG2]], [[REG2]]
 ; CHECK:        stw [[REG1]]
-; CHECK:        .LBB1_1: # %redo_first_pass
+; CHECK:        .LBB1_1:
 entry:
   %and = and i32 %p3, 128
   %tobool = icmp eq i32 %and, 0

+ 1 - 2
test/CodeGen/PowerPC/licm-remat.ll

@@ -24,8 +24,7 @@ define linkonce_odr void @ZN6snappyDecompressor_(%"class.snappy::SnappyDecompres
 ; CHECK-DAG:   addi 25, 3, _ZN6snappy8internalL8wordmaskE@toc@l
 ; CHECK-DAG:   addis 5, 2, _ZN6snappy8internalL10char_tableE@toc@ha
 ; CHECK-DAG:   addi 24, 5, _ZN6snappy8internalL10char_tableE@toc@l
-; CHECK:       b .[[LABEL1:[A-Z0-9_]+]]
-; CHECK:       .[[LABEL1]]: # %for.cond
+; CHECK:       .LBB0_2: # %for.cond
 ; CHECK-NOT:   addis {{[0-9]+}}, 2, _ZN6snappy8internalL8wordmaskE@toc@ha
 ; CHECK-NOT:   addis {{[0-9]+}}, 2, _ZN6snappy8internalL10char_tableE@toc@ha
 ; CHECK:       bctrl

+ 3 - 3
test/CodeGen/SystemZ/atomicrmw-minmax-01.ll

@@ -1,8 +1,8 @@
 ; Test 8-bit atomic min/max operations.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT1
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT2
 
 ; Check signed minimum.
 ; - CHECK is for the main loop.

+ 3 - 3
test/CodeGen/SystemZ/atomicrmw-minmax-02.ll

@@ -1,8 +1,8 @@
 ; Test 8-bit atomic min/max operations.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT1
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s -check-prefix=CHECK-SHIFT2
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT1
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s -check-prefix=CHECK-SHIFT2
 
 ; Check signed minimum.
 ; - CHECK is for the main loop.

+ 2 - 2
test/CodeGen/SystemZ/loop-01.ll

@@ -1,7 +1,7 @@
 ; Test loop tuning.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-block-placement | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -disable-block-placement \
 ; RUN:  | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-Z13
 
 ; Test that strength reduction is applied to addresses with a scale factor,

+ 1 - 1
test/CodeGen/SystemZ/loop-02.ll

@@ -1,7 +1,7 @@
 ; Test BRCTH.
 
 ; RUN: llc < %s -verify-machineinstrs -mtriple=s390x-linux-gnu -mcpu=z196 \
-; RUN:   -no-integrated-as | FileCheck %s
+; RUN:   -no-integrated-as -disable-block-placement | FileCheck %s
 
 ; Test a loop that should be converted into dbr form and then use BRCTH.
 define void @f2(i32 *%src, i32 *%dest) {

+ 2 - 2
test/CodeGen/SystemZ/swifterror.ll

@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=s390x-linux-gnu| FileCheck %s
-; RUN: llc < %s -O0 -mtriple=s390x-linux-gnu | FileCheck --check-prefix=CHECK-O0 %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=s390x-linux-gnu -disable-block-placement | FileCheck --check-prefix=CHECK-O0 %s
 
 declare i8* @malloc(i64)
 declare void @free(i8*)

+ 6 - 6
test/CodeGen/Thumb/consthoist-physical-addr.ll

@@ -10,8 +10,9 @@ define i32 @C(i32 %x, i32* nocapture %y) #0 {
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    ldr r3, .LCPI0_0
-; CHECK-NEXT:    b .LBB0_4
 ; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    cmp r2, #128
+; CHECK-NEXT:    beq .LBB0_5
 ; CHECK-NEXT:    movs r4, #0
 ; CHECK-NEXT:    str r4, [r3, #8]
 ; CHECK-NEXT:    lsls r4, r2, #2
@@ -20,16 +21,15 @@ define i32 @C(i32 %x, i32* nocapture %y) #0 {
 ; CHECK-NEXT:    movs r5, #1
 ; CHECK-NEXT:    str r5, [r3, #12]
 ; CHECK-NEXT:    isb sy
-; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:  .LBB0_3:
 ; CHECK-NEXT:    ldr r5, [r3, #12]
 ; CHECK-NEXT:    cmp r5, #0
-; CHECK-NEXT:    bne .LBB0_2
+; CHECK-NEXT:    bne .LBB0_3
 ; CHECK-NEXT:    ldr r5, [r3, #4]
 ; CHECK-NEXT:    str r5, [r1, r4]
 ; CHECK-NEXT:    adds r2, r2, #1
-; CHECK-NEXT:  .LBB0_4:
-; CHECK-NEXT:    cmp r2, #128
-; CHECK-NEXT:    bne .LBB0_1
+; CHECK-NEXT:    b .LBB0_1
+; CHECK-NEXT:  .LBB0_5:
 ; CHECK-NEXT:    movs r0, #0
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
 ; CHECK-NEXT:    .p2align 2
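
The rotated Thumb layout above puts the trip-count check (cmp r2, #128 / beq .LBB0_5) in the top block .LBB0_1, reached by fall through from entry and by the b .LBB0_1 backedge. A hypothetical loop with this shape, not the test's actual IR (names and triple are invented for illustration):

; Compile with: llc < sketch.ll -mtriple=thumbv6m-none-eabi
define void @sketch(i32* %y) {
entry:
  br label %header
header:                               ; the cmp/beq block placed at .LBB0_1
  %i = phi i32 [ 0, %entry ], [ %i.next, %body ]
  %done = icmp eq i32 %i, 128
  br i1 %done, label %exit, label %body
body:                                 ; reached by fall through from the check
  %p = getelementptr i32, i32* %y, i32 %i
  store i32 0, i32* %p
  %i.next = add i32 %i, 1
  br label %header                    ; emitted as the b .LBB0_1 backedge
exit:
  ret void
}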

+ 9 - 10
test/CodeGen/X86/block-placement.ll

@@ -82,14 +82,14 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
 ; Check that we sink cold loop blocks after the hot loop body.
 ; CHECK-LABEL: test_loop_cold_blocks:
 ; CHECK: %entry
-; CHECK-NOT: .p2align
-; CHECK: %unlikely1
-; CHECK-NOT: .p2align
-; CHECK: %unlikely2
 ; CHECK: .p2align
 ; CHECK: %body1
 ; CHECK: %body2
 ; CHECK: %body3
+; CHECK-NOT: .p2align
+; CHECK: %unlikely1
+; CHECK-NOT: .p2align
+; CHECK: %unlikely2
 ; CHECK: %exit
 
 entry:
@@ -125,7 +125,7 @@ exit:
   ret i32 %sum
 }
 
-!0 = !{!"branch_weights", i32 4, i32 64}
+!0 = !{!"branch_weights", i32 1, i32 64}
 
 define i32 @test_loop_early_exits(i32 %i, i32* %a) {
 ; Check that we sink early exit blocks out of loop bodies.
@@ -189,8 +189,8 @@ define i32 @test_loop_rotate(i32 %i, i32* %a) {
 ; loop, eliminating unconditional branches to the top.
 ; CHECK-LABEL: test_loop_rotate:
 ; CHECK: %entry
-; CHECK: %body1
 ; CHECK: %body0
+; CHECK: %body1
 ; CHECK: %exit
 
 entry:
@@ -957,16 +957,15 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
 ; CHECK: %if.else
 ; CHECK: %if.end10
 ; Second rotated loop top
-; CHECK: .p2align
-; CHECK: %if.then24
 ; CHECK: %while.cond.outer
 ; Third rotated loop top
 ; CHECK: .p2align
+; CHECK: %if.end20
 ; CHECK: %while.cond
 ; CHECK: %while.body
 ; CHECK: %land.lhs.true
 ; CHECK: %if.then19
-; CHECK: %if.end20
+; CHECK: %if.then24
 ; CHECK: %if.then8
 ; CHECK: ret
 
@@ -1546,8 +1545,8 @@ define i32 @not_rotate_if_extra_branch_regression(i32 %count, i32 %init) {
 ; CHECK-LABEL: not_rotate_if_extra_branch_regression
 ; CHECK: %.entry
 ; CHECK: %.first_backedge
-; CHECK: %.slow
 ; CHECK: %.second_header
+; CHECK: %.slow
 .entry:
   %sum.0 = shl nsw i32 %count, 1
   br label %.first_header

+ 5 - 2
test/CodeGen/X86/code_placement.ll

@@ -4,6 +4,11 @@
 @Te1 = external global [256 x i32]		; <[256 x i32]*> [#uses=4]
 @Te3 = external global [256 x i32]		; <[256 x i32]*> [#uses=2]
 
+; CHECK: %entry
+; CHECK: %bb
+; CHECK: %bb1
+; CHECK: %bb2
+
 define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
 entry:
 	%0 = load i32, i32* %rk, align 4		; <i32> [#uses=1]
@@ -12,8 +17,6 @@ entry:
 	%tmp15 = add i32 %r, -1		; <i32> [#uses=1]
 	%tmp.16 = zext i32 %tmp15 to i64		; <i64> [#uses=2]
 	br label %bb
-; CHECK: jmp
-; CHECK-NEXT: align
 
 bb:		; preds = %bb1, %entry
 	%indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ]		; <i64> [#uses=3]

+ 1 - 1
test/CodeGen/X86/code_placement_cold_loop_blocks.ll

@@ -44,8 +44,8 @@ define void @nested_loop_0(i1 %flag) !prof !1 {
 ; CHECK-LABEL: nested_loop_0:
 ; CHECK: callq c
 ; CHECK: callq d
-; CHECK: callq e
 ; CHECK: callq b
+; CHECK: callq e
 ; CHECK: callq f
 
 entry:

+ 3 - 4
test/CodeGen/X86/code_placement_ignore_succ_in_inner_loop.ll

@@ -1,13 +1,12 @@
 ; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
 
 define void @foo() {
-; Test that when determining the edge probability from a node in an inner loop
-; to a node in an outer loop, the weights on edges in the inner loop should be
-; ignored if we are building the chain for the outer loop.
+; After moving the latch to the top of the loop, there is no fall through from
+; the latch to the outer loop.
 ;
 ; CHECK-LABEL: foo:
-; CHECK: callq c
 ; CHECK: callq b
+; CHECK: callq c
 
 entry:
   %call = call zeroext i1 @a()
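
One possible CFG with the shape this test exercises, sketched with invented block and callee names (not the test's actual IR): once the inner latch sits at the top of the inner loop, its conditional exit to the outer-loop block calling c() can no longer be a fall through, so the layout leads with the inner body calling b().

define void @sketch() {
entry:
  br label %outer.header
outer.header:
  br label %inner.header
inner.header:
  call void @b()                      ; hot inner-loop body
  br label %inner.latch
inner.latch:                          ; latch with a conditional exit
  %t = call i1 @cond()
  br i1 %t, label %inner.header, label %outer.latch
outer.latch:
  call void @c()                      ; outer-loop block, no longer a fall-through target
  %u = call i1 @cond()
  br i1 %u, label %outer.header, label %exit
exit:
  ret void
}

declare void @b()
declare void @c()
declare i1 @cond()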

+ 7 - 7
test/CodeGen/X86/code_placement_loop_rotation2.ll

@@ -5,13 +5,13 @@ define void @foo() {
 ; Test a nested loop case when profile data is not available.
 ;
 ; CHECK-LABEL: foo:
+; CHECK: callq g
+; CHECK: callq h
 ; CHECK: callq b
-; CHECK: callq c
-; CHECK: callq d
 ; CHECK: callq e
 ; CHECK: callq f
-; CHECK: callq g
-; CHECK: callq h
+; CHECK: callq c
+; CHECK: callq d
 
 entry:
   br label %header
@@ -59,13 +59,13 @@ define void @bar() !prof !1 {
 ; Test a nested loop case when profile data is available.
 ;
 ; CHECK-PROFILE-LABEL: bar:
+; CHECK-PROFILE: callq h
+; CHECK-PROFILE: callq b
+; CHECK-PROFILE: callq g
 ; CHECK-PROFILE: callq e
 ; CHECK-PROFILE: callq f
 ; CHECK-PROFILE: callq c
 ; CHECK-PROFILE: callq d
-; CHECK-PROFILE: callq h
-; CHECK-PROFILE: callq b
-; CHECK-PROFILE: callq g
 
 entry:
   br label %header

+ 1 - 1
test/CodeGen/X86/code_placement_no_header_change.ll

@@ -7,9 +7,9 @@ define i32 @bar(i32 %count) {
 ; Later backedge1 and backedge2 are rotated before the loop header.
 ; CHECK-LABEL: bar
 ; CHECK: %.entry
+; CHECK: %.header
 ; CHECK: %.backedge1
 ; CHECK: %.backedge2
-; CHECK: %.header
 ; CHECK: %.exit
 .entry:
   %c = shl nsw i32 %count, 2

+ 89 - 89
test/CodeGen/X86/conditional-tailcall.ll

@@ -258,9 +258,12 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; CHECK32-NEXT:    .cfi_adjust_cfa_offset -4
 ; CHECK32-NEXT:    xorl %edi, %edi # encoding: [0x31,0xff]
 ; CHECK32-NEXT:    incl %edi # encoding: [0x47]
-; CHECK32-NEXT:    jmp .LBB3_1 # encoding: [0xeb,A]
-; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
-; CHECK32-NEXT:  .LBB3_2: # %for.body
+; CHECK32-NEXT:  .LBB3_1: # %for.cond
+; CHECK32-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK32-NEXT:    testl %edx, %edx # encoding: [0x85,0xd2]
+; CHECK32-NEXT:    je .LBB3_13 # encoding: [0x74,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  # %bb.2: # %for.body
 ; CHECK32-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK32-NEXT:    cmpl $2, %ebx # encoding: [0x83,0xfb,0x02]
 ; CHECK32-NEXT:    je .LBB3_11 # encoding: [0x74,A]
@@ -314,12 +317,9 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; CHECK32-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK32-NEXT:    incl %eax # encoding: [0x40]
 ; CHECK32-NEXT:    decl %edx # encoding: [0x4a]
-; CHECK32-NEXT:  .LBB3_1: # %for.cond
-; CHECK32-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK32-NEXT:    testl %edx, %edx # encoding: [0x85,0xd2]
-; CHECK32-NEXT:    jne .LBB3_2 # encoding: [0x75,A]
-; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_2-1, kind: FK_PCRel_1
-; CHECK32-NEXT:  # %bb.13:
+; CHECK32-NEXT:    jmp .LBB3_1 # encoding: [0xeb,A]
+; CHECK32-NEXT:    # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
+; CHECK32-NEXT:  .LBB3_13:
 ; CHECK32-NEXT:    cmpl $2, %ebx # encoding: [0x83,0xfb,0x02]
 ; CHECK32-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK32-NEXT:    jmp .LBB3_14 # encoding: [0xeb,A]
@@ -369,56 +369,59 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; CHECK64-NEXT:    .cfi_adjust_cfa_offset 8
 ; CHECK64-NEXT:    popq %r8 # encoding: [0x41,0x58]
 ; CHECK64-NEXT:    .cfi_adjust_cfa_offset -8
-; CHECK64-NEXT:    jmp .LBB3_11 # encoding: [0xeb,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  .LBB3_1: # %for.body
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
-; CHECK64-NEXT:    cmpl $2, %ecx # encoding: [0x83,0xf9,0x02]
-; CHECK64-NEXT:    je .LBB3_9 # encoding: [0x74,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB3_1: # %for.cond
+; CHECK64-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK64-NEXT:    testq %rax, %rax # encoding: [0x48,0x85,0xc0]
+; CHECK64-NEXT:    je .LBB3_12 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1
 ; CHECK64-NEXT:  # %bb.2: # %for.body
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
-; CHECK64-NEXT:    cmpl $1, %ecx # encoding: [0x83,0xf9,0x01]
-; CHECK64-NEXT:    je .LBB3_7 # encoding: [0x74,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; CHECK64-NEXT:    cmpl $2, %ecx # encoding: [0x83,0xf9,0x02]
+; CHECK64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
 ; CHECK64-NEXT:  # %bb.3: # %for.body
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; CHECK64-NEXT:    cmpl $1, %ecx # encoding: [0x83,0xf9,0x01]
+; CHECK64-NEXT:    je .LBB3_8 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.4: # %for.body
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    testl %ecx, %ecx # encoding: [0x85,0xc9]
-; CHECK64-NEXT:    jne .LBB3_10 # encoding: [0x75,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  # %bb.4: # %sw.bb
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:    jne .LBB3_11 # encoding: [0x75,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.5: # %sw.bb
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    movzbl (%rdi), %edx # encoding: [0x0f,0xb6,0x17]
 ; CHECK64-NEXT:    cmpl $43, %edx # encoding: [0x83,0xfa,0x2b]
 ; CHECK64-NEXT:    movl %r8d, %ecx # encoding: [0x44,0x89,0xc1]
-; CHECK64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  # %bb.5: # %sw.bb
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:    je .LBB3_11 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.6: # %sw.bb
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    cmpb $45, %dl # encoding: [0x80,0xfa,0x2d]
 ; CHECK64-NEXT:    movl %r8d, %ecx # encoding: [0x44,0x89,0xc1]
-; CHECK64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  # %bb.6: # %if.else
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:    je .LBB3_11 # encoding: [0x74,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  # %bb.7: # %if.else
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    addl $-48, %edx # encoding: [0x83,0xc2,0xd0]
 ; CHECK64-NEXT:    cmpl $10, %edx # encoding: [0x83,0xfa,0x0a]
-; CHECK64-NEXT:    jmp .LBB3_8 # encoding: [0xeb,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  .LBB3_7: # %sw.bb14
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:    jmp .LBB3_9 # encoding: [0xeb,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; CHECK64-NEXT:  .LBB3_8: # %sw.bb14
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f]
 ; CHECK64-NEXT:    addl $-48, %ecx # encoding: [0x83,0xc1,0xd0]
 ; CHECK64-NEXT:    cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a]
-; CHECK64-NEXT:  .LBB3_8: # %if.else
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:  .LBB3_9: # %if.else
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    movl %r9d, %ecx # encoding: [0x44,0x89,0xc9]
-; CHECK64-NEXT:    jb .LBB3_10 # encoding: [0x72,A]
-; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
+; CHECK64-NEXT:    jb .LBB3_11 # encoding: [0x72,A]
+; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
 ; CHECK64-NEXT:    jmp .LBB3_13 # encoding: [0xeb,A]
 ; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_13-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  .LBB3_9: # %sw.bb22
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:  .LBB3_10: # %sw.bb22
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    movzbl (%rdi), %ecx # encoding: [0x0f,0xb6,0x0f]
 ; CHECK64-NEXT:    addl $-48, %ecx # encoding: [0x83,0xc1,0xd0]
 ; CHECK64-NEXT:    cmpl $10, %ecx # encoding: [0x83,0xf9,0x0a]
@@ -426,16 +429,13 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; CHECK64-NEXT:    jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL
 ; CHECK64-NEXT:    # encoding: [0x73,A]
 ; CHECK64-NEXT:    # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  .LBB3_10: # %for.inc
-; CHECK64-NEXT:    # in Loop: Header=BB3_11 Depth=1
+; CHECK64-NEXT:  .LBB3_11: # %for.inc
+; CHECK64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; CHECK64-NEXT:    incq %rdi # encoding: [0x48,0xff,0xc7]
 ; CHECK64-NEXT:    decq %rax # encoding: [0x48,0xff,0xc8]
-; CHECK64-NEXT:  .LBB3_11: # %for.cond
-; CHECK64-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK64-NEXT:    testq %rax, %rax # encoding: [0x48,0x85,0xc0]
-; CHECK64-NEXT:    jne .LBB3_1 # encoding: [0x75,A]
+; CHECK64-NEXT:    jmp .LBB3_1 # encoding: [0xeb,A]
 ; CHECK64-NEXT:    # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
-; CHECK64-NEXT:  # %bb.12:
+; CHECK64-NEXT:  .LBB3_12:
 ; CHECK64-NEXT:    cmpl $2, %ecx # encoding: [0x83,0xf9,0x02]
 ; CHECK64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK64-NEXT:    # kill: def $al killed $al killed $eax
@@ -451,51 +451,54 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; WIN64-NEXT:    movq -24(%rcx), %r8 # encoding: [0x4c,0x8b,0x41,0xe8]
 ; WIN64-NEXT:    leaq (%rcx,%r8), %rdx # encoding: [0x4a,0x8d,0x14,0x01]
 ; WIN64-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
-; WIN64-NEXT:    jmp .LBB3_10 # encoding: [0xeb,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
-; WIN64-NEXT:  .LBB3_1: # %for.body
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
-; WIN64-NEXT:    cmpl $2, %eax # encoding: [0x83,0xf8,0x02]
-; WIN64-NEXT:    je .LBB3_8 # encoding: [0x74,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB3_1: # %for.cond
+; WIN64-NEXT:    # =>This Inner Loop Header: Depth=1
+; WIN64-NEXT:    testq %r8, %r8 # encoding: [0x4d,0x85,0xc0]
+; WIN64-NEXT:    je .LBB3_11 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_11-1, kind: FK_PCRel_1
 ; WIN64-NEXT:  # %bb.2: # %for.body
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
-; WIN64-NEXT:    cmpl $1, %eax # encoding: [0x83,0xf8,0x01]
-; WIN64-NEXT:    je .LBB3_6 # encoding: [0x74,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_6-1, kind: FK_PCRel_1
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; WIN64-NEXT:    cmpl $2, %eax # encoding: [0x83,0xf8,0x02]
+; WIN64-NEXT:    je .LBB3_9 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
 ; WIN64-NEXT:  # %bb.3: # %for.body
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
+; WIN64-NEXT:    cmpl $1, %eax # encoding: [0x83,0xf8,0x01]
+; WIN64-NEXT:    je .LBB3_7 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.4: # %for.body
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    testl %eax, %eax # encoding: [0x85,0xc0]
-; WIN64-NEXT:    jne .LBB3_9 # encoding: [0x75,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
-; WIN64-NEXT:  # %bb.4: # %sw.bb
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:    jne .LBB3_10 # encoding: [0x75,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.5: # %sw.bb
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
 ; WIN64-NEXT:    cmpl $43, %r9d # encoding: [0x41,0x83,0xf9,0x2b]
 ; WIN64-NEXT:    movl $1, %eax # encoding: [0xb8,0x01,0x00,0x00,0x00]
-; WIN64-NEXT:    je .LBB3_9 # encoding: [0x74,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
-; WIN64-NEXT:  # %bb.5: # %sw.bb
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
+; WIN64-NEXT:  # %bb.6: # %sw.bb
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    cmpb $45, %r9b # encoding: [0x41,0x80,0xf9,0x2d]
-; WIN64-NEXT:    je .LBB3_9 # encoding: [0x74,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
-; WIN64-NEXT:    jmp .LBB3_7 # encoding: [0xeb,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_7-1, kind: FK_PCRel_1
-; WIN64-NEXT:  .LBB3_6: # %sw.bb14
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:    je .LBB3_10 # encoding: [0x74,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
+; WIN64-NEXT:    jmp .LBB3_8 # encoding: [0xeb,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_8-1, kind: FK_PCRel_1
+; WIN64-NEXT:  .LBB3_7: # %sw.bb14
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
-; WIN64-NEXT:  .LBB3_7: # %if.else
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:  .LBB3_8: # %if.else
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0]
 ; WIN64-NEXT:    movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
 ; WIN64-NEXT:    cmpl $10, %r9d # encoding: [0x41,0x83,0xf9,0x0a]
-; WIN64-NEXT:    jb .LBB3_9 # encoding: [0x72,A]
-; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_9-1, kind: FK_PCRel_1
+; WIN64-NEXT:    jb .LBB3_10 # encoding: [0x72,A]
+; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_10-1, kind: FK_PCRel_1
 ; WIN64-NEXT:    jmp .LBB3_12 # encoding: [0xeb,A]
 ; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_12-1, kind: FK_PCRel_1
-; WIN64-NEXT:  .LBB3_8: # %sw.bb22
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:  .LBB3_9: # %sw.bb22
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    movzbl (%rcx), %r9d # encoding: [0x44,0x0f,0xb6,0x09]
 ; WIN64-NEXT:    addl $-48, %r9d # encoding: [0x41,0x83,0xc1,0xd0]
 ; WIN64-NEXT:    movl $2, %eax # encoding: [0xb8,0x02,0x00,0x00,0x00]
@@ -503,16 +506,13 @@ define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly derefe
 ; WIN64-NEXT:    jae _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_ # TAILCALL
 ; WIN64-NEXT:    # encoding: [0x73,A]
 ; WIN64-NEXT:    # fixup A - offset: 1, value: _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_-1, kind: FK_PCRel_1
-; WIN64-NEXT:  .LBB3_9: # %for.inc
-; WIN64-NEXT:    # in Loop: Header=BB3_10 Depth=1
+; WIN64-NEXT:  .LBB3_10: # %for.inc
+; WIN64-NEXT:    # in Loop: Header=BB3_1 Depth=1
 ; WIN64-NEXT:    incq %rcx # encoding: [0x48,0xff,0xc1]
 ; WIN64-NEXT:    decq %r8 # encoding: [0x49,0xff,0xc8]
-; WIN64-NEXT:  .LBB3_10: # %for.cond
-; WIN64-NEXT:    # =>This Inner Loop Header: Depth=1
-; WIN64-NEXT:    testq %r8, %r8 # encoding: [0x4d,0x85,0xc0]
-; WIN64-NEXT:    jne .LBB3_1 # encoding: [0x75,A]
+; WIN64-NEXT:    jmp .LBB3_1 # encoding: [0xeb,A]
 ; WIN64-NEXT:    # fixup A - offset: 1, value: .LBB3_1-1, kind: FK_PCRel_1
-; WIN64-NEXT:  # %bb.11:
+; WIN64-NEXT:  .LBB3_11:
 ; WIN64-NEXT:    cmpl $2, %eax # encoding: [0x83,0xf8,0x02]
 ; WIN64-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; WIN64-NEXT:    # kill: def $al killed $al killed $eax

+ 22 - 16
test/CodeGen/X86/loop-blocks.ll

@@ -7,12 +7,14 @@
 ; order to avoid a branch within the loop.
 
 ; CHECK-LABEL: simple:
-;      CHECK:   jmp   .LBB0_1
-; CHECK-NEXT:   align
-; CHECK-NEXT: .LBB0_2:
-; CHECK-NEXT:   callq loop_latch
+;      CHECK:   align
 ; CHECK-NEXT: .LBB0_1:
 ; CHECK-NEXT:   callq loop_header
+;      CHECK:   js .LBB0_3
+; CHECK-NEXT:   callq loop_latch
+; CHECK-NEXT:   jmp .LBB0_1
+; CHECK-NEXT: .LBB0_3:
+; CHECK-NEXT:   callq exit
 
 define void @simple() nounwind {
 entry:
@@ -75,17 +77,21 @@ exit:
 ; CHECK-LABEL: yet_more_involved:
 ;      CHECK:   jmp .LBB2_1
 ; CHECK-NEXT:   align
-; CHECK-NEXT: .LBB2_5:
-; CHECK-NEXT:   callq block_a_true_func
-; CHECK-NEXT:   callq block_a_merge_func
-; CHECK-NEXT: .LBB2_1:
+
+;      CHECK: .LBB2_1:
 ; CHECK-NEXT:   callq body
-;
-; LBB2_4
-;      CHECK:   callq bar99
+; CHECK-NEXT:   callq get
+; CHECK-NEXT:   cmpl $2, %eax
+; CHECK-NEXT:   jge .LBB2_2
+; CHECK-NEXT:   callq bar99
 ; CHECK-NEXT:   callq get
 ; CHECK-NEXT:   cmpl $2999, %eax
-; CHECK-NEXT:   jle .LBB2_5
+; CHECK-NEXT:   jg .LBB2_6
+; CHECK-NEXT:   callq block_a_true_func
+; CHECK-NEXT:   callq block_a_merge_func
+; CHECK-NEXT:   jmp .LBB2_1
+; CHECK-NEXT:   align
+; CHECK-NEXT: .LBB2_6:
 ; CHECK-NEXT:   callq block_a_false_func
 ; CHECK-NEXT:   callq block_a_merge_func
 ; CHECK-NEXT:   jmp .LBB2_1
@@ -201,12 +207,12 @@ block102:
 }
 
 ; CHECK-LABEL: check_minsize:
-;      CHECK:   jmp   .LBB4_1
 ; CHECK-NOT:   align
-; CHECK-NEXT: .LBB4_2:
-; CHECK-NEXT:   callq loop_latch
-; CHECK-NEXT: .LBB4_1:
+; CHECK:      .LBB4_1:
 ; CHECK-NEXT:   callq loop_header
+; CHECK:        callq loop_latch
+; CHECK:      .LBB4_3:
+; CHECK:        callq exit
 
 
 define void @check_minsize() minsize nounwind {

+ 120 - 0
test/CodeGen/X86/loop-rotate.ll

@@ -0,0 +1,120 @@
+; RUN: llc -mtriple=i686-linux < %s | FileCheck %s
+
+; Don't rotate the loop if the number of fall throughs to the exit is not
+; larger than the number of fall throughs to the header.
+define void @no_rotate() {
+; CHECK-LABEL: no_rotate
+; CHECK: %entry
+; CHECK: %header
+; CHECK: %middle
+; CHECK: %latch1
+; CHECK: %latch2
+; CHECK: %end
+entry:
+  br label %header
+
+header:
+  %val1 = call i1 @foo()
+  br i1 %val1, label %middle, label %end
+
+middle:
+  %val2 = call i1 @foo()
+  br i1 %val2, label %latch1, label %end
+
+latch1:
+  %val3 = call i1 @foo()
+  br i1 %val3, label %latch2, label %header
+
+latch2:
+  %val4 = call i1 @foo()
+  br label %header
+
+end:
+  ret void
+}
+
+define void @do_rotate() {
+; CHECK-LABEL: do_rotate
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %else
+; CHECK: %latch1
+; CHECK: %latch2
+; CHECK: %header
+; CHECK: %end
+entry:
+  %val0 = call i1 @foo()
+  br i1 %val0, label %then, label %else
+
+then:
+  call void @a()
+  br label %header
+
+else:
+  call void @b()
+  br label %header
+
+header:
+  %val1 = call i1 @foo()
+  br i1 %val1, label %latch1, label %end
+
+latch1:
+  %val3 = call i1 @foo()
+  br i1 %val3, label %latch2, label %header
+
+latch2:
+  %val4 = call i1 @foo()
+  br label %header
+
+end:
+  ret void
+}
+
+; The loop structure is the same as in @no_rotate, but the loop header's
+; predecessor doesn't fall through to it, so the loop should be rotated to get
+; an exit fall through.
+define void @do_rotate2() {
+; CHECK-LABEL: do_rotate2
+; CHECK: %entry
+; CHECK: %then
+; CHECK: %middle
+; CHECK: %latch1
+; CHECK: %latch2
+; CHECK: %header
+; CHECK: %exit
+entry:
+  %val0 = call i1 @foo()
+  br i1 %val0, label %then, label %header, !prof !1
+
+then:
+  call void @a()
+  br label %end
+
+header:
+  %val1 = call i1 @foo()
+  br i1 %val1, label %middle, label %exit
+
+middle:
+  %val2 = call i1 @foo()
+  br i1 %val2, label %latch1, label %exit
+
+latch1:
+  %val3 = call i1 @foo()
+  br i1 %val3, label %latch2, label %header
+
+latch2:
+  %val4 = call i1 @foo()
+  br label %header
+
+exit:
+  call void @b()
+  br label %end
+
+end:
+  ret void
+}
+
+declare i1 @foo()
+declare void @a()
+declare void @b()
+
+!1 = !{!"branch_weights", i32 10, i32 1}
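
For reference, branch_weights encodes relative successor probabilities: the entry branch of @do_rotate2 carries !prof !1, so P(then) = 10 / (10 + 1), about 0.91, and P(header) = 1 / 11, about 0.09. That is what makes the header's predecessor unlikely to fall through to it and lets the rotation pay off.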

+ 33 - 33
test/CodeGen/X86/lsr-loop-exit-cond.ll

@@ -21,22 +21,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    movq _Te1@{{.*}}(%rip), %r8
 ; GENERIC-NEXT:    movq _Te3@{{.*}}(%rip), %r10
 ; GENERIC-NEXT:    movq %rcx, %r11
-; GENERIC-NEXT:    jmp LBB0_1
 ; GENERIC-NEXT:    .p2align 4, 0x90
-; GENERIC-NEXT:  LBB0_2: ## %bb1
-; GENERIC-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; GENERIC-NEXT:    movl %edi, %ebx
-; GENERIC-NEXT:    shrl $16, %ebx
-; GENERIC-NEXT:    movzbl %bl, %ebx
-; GENERIC-NEXT:    xorl (%r8,%rbx,4), %eax
-; GENERIC-NEXT:    xorl -4(%r14), %eax
-; GENERIC-NEXT:    shrl $24, %edi
-; GENERIC-NEXT:    movzbl %bpl, %ebx
-; GENERIC-NEXT:    movl (%r10,%rbx,4), %ebx
-; GENERIC-NEXT:    xorl (%r9,%rdi,4), %ebx
-; GENERIC-NEXT:    xorl (%r14), %ebx
-; GENERIC-NEXT:    decq %r11
-; GENERIC-NEXT:    addq $16, %r14
 ; GENERIC-NEXT:  LBB0_1: ## %bb
 ; GENERIC-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; GENERIC-NEXT:    movzbl %al, %edi
@@ -56,8 +41,23 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT:    shrl $24, %eax
 ; GENERIC-NEXT:    movl (%r9,%rax,4), %eax
 ; GENERIC-NEXT:    testq %r11, %r11
-; GENERIC-NEXT:    jne LBB0_2
-; GENERIC-NEXT:  ## %bb.3: ## %bb2
+; GENERIC-NEXT:    je LBB0_3
+; GENERIC-NEXT:  ## %bb.2: ## %bb1
+; GENERIC-NEXT:    ## in Loop: Header=BB0_1 Depth=1
+; GENERIC-NEXT:    movl %edi, %ebx
+; GENERIC-NEXT:    shrl $16, %ebx
+; GENERIC-NEXT:    movzbl %bl, %ebx
+; GENERIC-NEXT:    xorl (%r8,%rbx,4), %eax
+; GENERIC-NEXT:    xorl -4(%r14), %eax
+; GENERIC-NEXT:    shrl $24, %edi
+; GENERIC-NEXT:    movzbl %bpl, %ebx
+; GENERIC-NEXT:    movl (%r10,%rbx,4), %ebx
+; GENERIC-NEXT:    xorl (%r9,%rdi,4), %ebx
+; GENERIC-NEXT:    xorl (%r14), %ebx
+; GENERIC-NEXT:    decq %r11
+; GENERIC-NEXT:    addq $16, %r14
+; GENERIC-NEXT:    jmp LBB0_1
+; GENERIC-NEXT:  LBB0_3: ## %bb2
 ; GENERIC-NEXT:    shlq $4, %rcx
 ; GENERIC-NEXT:    andl $-16777216, %eax ## imm = 0xFF000000
 ; GENERIC-NEXT:    movl %edi, %ebx
@@ -105,21 +105,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movq _Te3@{{.*}}(%rip), %r10
 ; ATOM-NEXT:    decl %ecx
 ; ATOM-NEXT:    movq %rcx, %r11
-; ATOM-NEXT:    jmp LBB0_1
 ; ATOM-NEXT:    .p2align 4, 0x90
-; ATOM-NEXT:  LBB0_2: ## %bb1
-; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
-; ATOM-NEXT:    shrl $16, %eax
-; ATOM-NEXT:    shrl $24, %edi
-; ATOM-NEXT:    decq %r11
-; ATOM-NEXT:    movzbl %al, %ebp
-; ATOM-NEXT:    movzbl %bl, %eax
-; ATOM-NEXT:    movl (%r10,%rax,4), %eax
-; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
-; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
-; ATOM-NEXT:    xorl -4(%r14), %r15d
-; ATOM-NEXT:    xorl (%r14), %eax
-; ATOM-NEXT:    addq $16, %r14
 ; ATOM-NEXT:  LBB0_1: ## %bb
 ; ATOM-NEXT:    ## =>This Inner Loop Header: Depth=1
 ; ATOM-NEXT:    movl %eax, %edi
@@ -140,8 +126,22 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT:    movl (%r9,%rax,4), %r15d
 ; ATOM-NEXT:    testq %r11, %r11
 ; ATOM-NEXT:    movl %edi, %eax
-; ATOM-NEXT:    jne LBB0_2
-; ATOM-NEXT:  ## %bb.3: ## %bb2
+; ATOM-NEXT:    je LBB0_3
+; ATOM-NEXT:  ## %bb.2: ## %bb1
+; ATOM-NEXT:    ## in Loop: Header=BB0_1 Depth=1
+; ATOM-NEXT:    shrl $16, %eax
+; ATOM-NEXT:    shrl $24, %edi
+; ATOM-NEXT:    decq %r11
+; ATOM-NEXT:    movzbl %al, %ebp
+; ATOM-NEXT:    movzbl %bl, %eax
+; ATOM-NEXT:    movl (%r10,%rax,4), %eax
+; ATOM-NEXT:    xorl (%r8,%rbp,4), %r15d
+; ATOM-NEXT:    xorl (%r9,%rdi,4), %eax
+; ATOM-NEXT:    xorl -4(%r14), %r15d
+; ATOM-NEXT:    xorl (%r14), %eax
+; ATOM-NEXT:    addq $16, %r14
+; ATOM-NEXT:    jmp LBB0_1
+; ATOM-NEXT:  LBB0_3: ## %bb2
 ; ATOM-NEXT:    shrl $16, %eax
 ; ATOM-NEXT:    shrl $8, %edi
 ; ATOM-NEXT:    movzbl %bl, %ebp

+ 239 - 0
test/CodeGen/X86/move_latch_to_loop_top.ll

@@ -0,0 +1,239 @@
+; RUN: llc  -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
+
+; The latch block should be moved before the header.
+;CHECK-LABEL: test1:
+;CHECK:       %latch
+;CHECK:       %header
+;CHECK:       %false
+define i32 @test1(i32* %p) {
+entry:
+  br label %header
+
+header:
+  %x1 = phi i64 [0, %entry], [%x2, %latch]
+  %count1 = phi i32 [0, %entry], [%count4, %latch]
+  %0 = ptrtoint i32* %p to i64
+  %1 = add i64 %0, %x1
+  %2 = inttoptr i64 %1 to i32*
+  %data = load i32, i32* %2
+  %3 = icmp eq i32 %data, 0
+  br i1 %3, label %latch, label %false
+
+false:
+  %count2 = add i32 %count1, 1
+  br label %latch
+
+latch:
+  %count4 = phi i32 [%count2, %false], [%count1, %header]
+  %x2 = add i64 %x1, 1
+  %4 = icmp eq i64 %x2, 100
+  br i1 %4, label %exit, label %header
+
+exit:
+  ret i32 %count4
+}
+
+; The latch block and one of false/true should be moved before the header.
+;CHECK-LABEL: test2:
+;CHECK:       %true
+;CHECK:       %latch
+;CHECK:       %header
+;CHECK:       %false
+define i32 @test2(i32* %p) {
+entry:
+  br label %header
+
+header:
+  %x1 = phi i64 [0, %entry], [%x2, %latch]
+  %count1 = phi i32 [0, %entry], [%count4, %latch]
+  %0 = ptrtoint i32* %p to i64
+  %1 = add i64 %0, %x1
+  %2 = inttoptr i64 %1 to i32*
+  %data = load i32, i32* %2
+  %3 = icmp eq i32 %data, 0
+  br i1 %3, label %true, label %false
+
+false:
+  %count2 = add i32 %count1, 1
+  br label %latch
+
+true:
+  %count3 = add i32 %count1, 2
+  br label %latch
+
+latch:
+  %count4 = phi i32 [%count2, %false], [%count3, %true]
+  %x2 = add i64 %x1, 1
+  %4 = icmp eq i64 %x2, 100
+  br i1 %4, label %exit, label %header
+
+exit:
+  ret i32 %count4
+}
+
+; More blocks can be moved before the header.
+;            header <------------
+;              /\               |
+;             /  \              |
+;            /    \             |
+;           /      \            |
+;          /        \           |
+;        true      false        |
+;         /\         /\         |
+;        /  \       /  \        |
+;       /    \     /    \       |
+;    true3 false3 /      \      |
+;      \    /   true2  false2   |
+;       \  /      \      /      |
+;        \/        \    /       |
+;      endif3       \  /        |
+;         \          \/         |
+;          \       endif2       |
+;           \        /          |
+;            \      /           |
+;             \    /            |
+;              \  /             |
+;               \/              |
+;              latch-------------
+;                |
+;                |
+;              exit
+;
+; Blocks true3, endif3 and latch should be moved before the header.
+;
+;CHECK-LABEL: test3:
+;CHECK:       %true3
+;CHECK:       %endif3
+;CHECK:       %latch
+;CHECK:       %header
+;CHECK:       %false
+define i32 @test3(i32* %p) {
+entry:
+  br label %header
+
+header:
+  %x1 = phi i64 [0, %entry], [%x2, %latch]
+  %count1 = phi i32 [0, %entry], [%count12, %latch]
+  %0 = ptrtoint i32* %p to i64
+  %1 = add i64 %0, %x1
+  %2 = inttoptr i64 %1 to i32*
+  %data = load i32, i32* %2
+  %3 = icmp eq i32 %data, 0
+  br i1 %3, label %true, label %false, !prof !3
+
+false:
+  %count2 = add i32 %count1, 1
+  %cond = icmp sgt i32 %count2, 10
+  br i1 %cond, label %true2, label %false2
+
+false2:
+  %count3 = and i32 %count2, 7
+  br label %endif2
+
+true2:
+  %count4 = mul i32 %count2, 3
+  br label %endif2
+
+endif2:
+  %count5 = phi i32 [%count3, %false2], [%count4, %true2]
+  %count6 = sub i32 %count5, 5
+  br label %latch
+
+true:
+  %count7 = add i32 %count1, 2
+  %cond2 = icmp slt i32 %count7, 20
+  br i1 %cond2, label %true3, label %false3
+
+false3:
+  %count8 = or i32 %count7, 3
+  br label %endif3
+
+true3:
+  %count9 = xor i32 %count7, 55
+  br label %endif3
+
+endif3:
+  %count10 = phi i32 [%count8, %false3], [%count9, %true3]
+  %count11 = add i32 %count10, 3
+  br label %latch
+
+latch:
+  %count12 = phi i32 [%count6, %endif2], [%count11, %endif3]
+  %x2 = add i64 %x1, 1
+  %4 = icmp eq i64 %x2, 100
+  br i1 %4, label %exit, label %header
+
+exit:
+  ret i32 %count12
+}
+
+; The exit block has a higher frequency than the false block, so the latch
+; block should not be moved before the header.
+;CHECK-LABEL: test4:
+;CHECK:       %header
+;CHECK:       %true
+;CHECK:       %latch
+;CHECK:       %false
+;CHECK:       %exit
+define i32 @test4(i32 %t, i32* %p) {
+entry:
+  br label %header
+
+header:
+  %x1 = phi i64 [0, %entry], [%x2, %latch]
+  %count1 = phi i32 [0, %entry], [%count4, %latch]
+  %0 = ptrtoint i32* %p to i64
+  %1 = add i64 %0, %x1
+  %2 = inttoptr i64 %1 to i32*
+  %data = load i32, i32* %2
+  %3 = icmp eq i32 %data, 0
+  br i1 %3, label %true, label %false, !prof !1
+
+false:
+  %count2 = add i32 %count1, 1
+  br label %latch
+
+true:
+  %count3 = add i32 %count1, 2
+  br label %latch
+
+latch:
+  %count4 = phi i32 [%count2, %false], [%count3, %true]
+  %x2 = add i64 %x1, 1
+  %4 = icmp eq i64 %x2, 100
+  br i1 %4, label %exit, label %header, !prof !2
+
+exit:
+  ret i32 %count4
+}
+
+!1 = !{!"branch_weights", i32 100, i32 1}
+!2 = !{!"branch_weights", i32 16, i32 16}
+!3 = !{!"branch_weights", i32 51, i32 49}
+
+; If moving the latch to the loop top doesn't reduce taken branches, don't do it.
+;CHECK-LABEL: test5:
+;CHECK:       %entry
+;CHECK:       %header
+;CHECK:       %latch
+define void @test5(i32* %p) {
+entry:
+  br label %header
+
+header:
+  %x1 = phi i64 [0, %entry], [%x1, %header], [%x2, %latch]
+  %0 = ptrtoint i32* %p to i64
+  %1 = add i64 %0, %x1
+  %2 = inttoptr i64 %1 to i32*
+  %data = load i32, i32* %2
+  %3 = icmp eq i32 %data, 0
+  br i1 %3, label %latch, label %header
+
+latch:
+  %x2 = add i64 %x1, 1
+  br label %header
+
+exit:
+  ret void
+}
+
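
A quick frequency check behind the @test4 comment: with !1 = {100, 1} the header reaches %false only 1 time in 101, and with !2 = {16, 16} the latch exits with probability 1/2, so the loop runs about 2 iterations per entry. Per function entry that gives freq(exit) = 1 versus freq(false) = 2/101, roughly 0.02, so the exit edge, not %false, is the one worth keeping cheap, and the latch stays below the header.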

+ 8 - 8
test/CodeGen/X86/pr38185.ll

@@ -5,9 +5,13 @@ define void @foo(i32* %a, i32* %b, i32* noalias %c, i64 %s) {
 ; CHECK-LABEL: foo:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %body
+; CHECK-NEXT:  .LBB0_1: # %loop
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %r9
+; CHECK-NEXT:    cmpq %rcx, %r9
+; CHECK-NEXT:    je .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %body
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $1, (%rdx,%r9,4)
 ; CHECK-NEXT:    movzbl (%rdi,%r9,4), %r8d
@@ -17,12 +21,8 @@ define void @foo(i32* %a, i32* %b, i32* noalias %c, i64 %s) {
 ; CHECK-NEXT:    movl %eax, (%rdi,%r9,4)
 ; CHECK-NEXT:    incq %r9
 ; CHECK-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:  .LBB0_1: # %loop
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT:    cmpq %rcx, %r9
-; CHECK-NEXT:    jne .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %endloop
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %endloop
 ; CHECK-NEXT:    retq
 %i = alloca i64
 store i64 0, i64* %i

+ 63 - 60
test/CodeGen/X86/ragreedy-hoist-spill.ll

@@ -103,6 +103,34 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    xorl %r13d, %r13d
 ; CHECK-NEXT:    jmp LBB0_13
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB0_20: ## %sw.bb256
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movl %r14d, %r13d
+; CHECK-NEXT:  LBB0_21: ## %while.cond197.backedge
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    decl %r15d
+; CHECK-NEXT:    testl %r15d, %r15d
+; CHECK-NEXT:    movl %r13d, %r14d
+; CHECK-NEXT:    jle LBB0_22
+; CHECK-NEXT:  LBB0_13: ## %while.body200
+; CHECK-NEXT:    ## =>This Loop Header: Depth=1
+; CHECK-NEXT:    ## Child Loop BB0_30 Depth 2
+; CHECK-NEXT:    ## Child Loop BB0_38 Depth 2
+; CHECK-NEXT:    leal -268(%r14), %eax
+; CHECK-NEXT:    cmpl $105, %eax
+; CHECK-NEXT:    ja LBB0_14
+; CHECK-NEXT:  ## %bb.56: ## %while.body200
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movslq (%rdi,%rax,4), %rax
+; CHECK-NEXT:    addq %rdi, %rax
+; CHECK-NEXT:    jmpq *%rax
+; CHECK-NEXT:  LBB0_44: ## %while.cond1037.preheader
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    movl %r14d, %r13d
+; CHECK-NEXT:    jne LBB0_21
+; CHECK-NEXT:    jmp LBB0_55
+; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_14: ## %while.body200
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    leal 1(%r14), %eax
@@ -118,12 +146,6 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    movl $1, %r13d
 ; CHECK-NEXT:    jmp LBB0_21
-; CHECK-NEXT:  LBB0_44: ## %while.cond1037.preheader
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    movl %r14d, %r13d
-; CHECK-NEXT:    jne LBB0_21
-; CHECK-NEXT:    jmp LBB0_55
 ; CHECK-NEXT:  LBB0_26: ## %sw.bb474
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    testb %dl, %dl
@@ -137,30 +159,52 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:  ## %bb.28: ## %land.rhs485.preheader
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    ## implicit-def: $rax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jns LBB0_30
+; CHECK-NEXT:    jmp LBB0_55
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB0_29: ## %land.rhs485
-; CHECK-NEXT:    ## Parent Loop BB0_13 Depth=1
-; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
+; CHECK-NEXT:  LBB0_32: ## %do.body479.backedge
+; CHECK-NEXT:    ## in Loop: Header=BB0_30 Depth=2
+; CHECK-NEXT:    leaq 1(%r12), %rax
+; CHECK-NEXT:    testb %dl, %dl
+; CHECK-NEXT:    je LBB0_33
+; CHECK-NEXT:  ## %bb.29: ## %land.rhs485
+; CHECK-NEXT:    ## in Loop: Header=BB0_30 Depth=2
 ; CHECK-NEXT:    testb %al, %al
 ; CHECK-NEXT:    js LBB0_55
-; CHECK-NEXT:  ## %bb.30: ## %cond.true.i.i2780
-; CHECK-NEXT:    ## in Loop: Header=BB0_29 Depth=2
+; CHECK-NEXT:  LBB0_30: ## %cond.true.i.i2780
+; CHECK-NEXT:    ## Parent Loop BB0_13 Depth=1
+; CHECK-NEXT:    ## => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    movq %rax, %r12
 ; CHECK-NEXT:    testb %dl, %dl
 ; CHECK-NEXT:    jne LBB0_32
 ; CHECK-NEXT:  ## %bb.31: ## %lor.rhs500
-; CHECK-NEXT:    ## in Loop: Header=BB0_29 Depth=2
+; CHECK-NEXT:    ## in Loop: Header=BB0_30 Depth=2
 ; CHECK-NEXT:    movl $256, %esi ## imm = 0x100
 ; CHECK-NEXT:    callq ___maskrune
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    je LBB0_34
-; CHECK-NEXT:  LBB0_32: ## %do.body479.backedge
-; CHECK-NEXT:    ## in Loop: Header=BB0_29 Depth=2
-; CHECK-NEXT:    leaq 1(%r12), %rax
-; CHECK-NEXT:    testb %dl, %dl
-; CHECK-NEXT:    jne LBB0_29
-; CHECK-NEXT:  ## %bb.33: ## %if.end517.loopexitsplit
+; CHECK-NEXT:    jne LBB0_32
+; CHECK-NEXT:    jmp LBB0_34
+; CHECK-NEXT:  LBB0_45: ## %sw.bb1134
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    cmpq %rax, %rcx
+; CHECK-NEXT:    jb LBB0_55
+; CHECK-NEXT:  ## %bb.46: ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    xorl %ebp, %ebp
+; CHECK-NEXT:    movl $268, %r13d ## imm = 0x10C
+; CHECK-NEXT:    jmp LBB0_21
+; CHECK-NEXT:  LBB0_19: ## %sw.bb243
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movl $2, %r13d
+; CHECK-NEXT:    jmp LBB0_21
+; CHECK-NEXT:  LBB0_40: ## %sw.bb566
+; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
+; CHECK-NEXT:    movl $20, %r13d
+; CHECK-NEXT:    jmp LBB0_21
+; CHECK-NEXT:  LBB0_33: ## %if.end517.loopexitsplit
 ; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
 ; CHECK-NEXT:    incq %r12
 ; CHECK-NEXT:  LBB0_34: ## %if.end517
@@ -199,47 +243,6 @@ define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
 ; CHECK-NEXT:    leaq {{.*}}(%rip), %rsi
 ; CHECK-NEXT:    leaq {{.*}}(%rip), %rdi
 ; CHECK-NEXT:    jmp LBB0_21
-; CHECK-NEXT:  LBB0_45: ## %sw.bb1134
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    cmpq %rax, %rcx
-; CHECK-NEXT:    jb LBB0_55
-; CHECK-NEXT:  ## %bb.46: ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:    movl $268, %r13d ## imm = 0x10C
-; CHECK-NEXT:    jmp LBB0_21
-; CHECK-NEXT:  LBB0_19: ## %sw.bb243
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $2, %r13d
-; CHECK-NEXT:    jmp LBB0_21
-; CHECK-NEXT:  LBB0_40: ## %sw.bb566
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl $20, %r13d
-; CHECK-NEXT:    jmp LBB0_21
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB0_13: ## %while.body200
-; CHECK-NEXT:    ## =>This Loop Header: Depth=1
-; CHECK-NEXT:    ## Child Loop BB0_29 Depth 2
-; CHECK-NEXT:    ## Child Loop BB0_38 Depth 2
-; CHECK-NEXT:    leal -268(%r14), %eax
-; CHECK-NEXT:    cmpl $105, %eax
-; CHECK-NEXT:    ja LBB0_14
-; CHECK-NEXT:  ## %bb.56: ## %while.body200
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movslq (%rdi,%rax,4), %rax
-; CHECK-NEXT:    addq %rdi, %rax
-; CHECK-NEXT:    jmpq *%rax
-; CHECK-NEXT:  LBB0_20: ## %sw.bb256
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    movl %r14d, %r13d
-; CHECK-NEXT:  LBB0_21: ## %while.cond197.backedge
-; CHECK-NEXT:    ## in Loop: Header=BB0_13 Depth=1
-; CHECK-NEXT:    decl %r15d
-; CHECK-NEXT:    testl %r15d, %r15d
-; CHECK-NEXT:    movl %r13d, %r14d
-; CHECK-NEXT:    jg LBB0_13
-; CHECK-NEXT:    jmp LBB0_22
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_42: ## %while.cond864
 ; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1

+ 18 - 17
test/CodeGen/X86/reverse_branches.ll

@@ -85,25 +85,36 @@ define i32 @test_branches_order() uwtable ssp {
 ; CHECK-NEXT:    jg LBB0_16
 ; CHECK-NEXT:  LBB0_9: ## %for.cond18.preheader
 ; CHECK-NEXT:    ## =>This Loop Header: Depth=1
-; CHECK-NEXT:    ## Child Loop BB0_10 Depth 2
+; CHECK-NEXT:    ## Child Loop BB0_11 Depth 2
 ; CHECK-NEXT:    ## Child Loop BB0_12 Depth 3
 ; CHECK-NEXT:    movq %rcx, %rdx
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    xorl %edi, %edi
+; CHECK-NEXT:    cmpl $999, %edi ## imm = 0x3E7
+; CHECK-NEXT:    jle LBB0_11
+; CHECK-NEXT:    jmp LBB0_15
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB0_10: ## %for.cond18
+; CHECK-NEXT:  LBB0_14: ## %exit
+; CHECK-NEXT:    ## in Loop: Header=BB0_11 Depth=2
+; CHECK-NEXT:    addq %rsi, %rbp
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    decq %rsi
+; CHECK-NEXT:    addq $1001, %rdx ## imm = 0x3E9
+; CHECK-NEXT:    cmpq $-1000, %rbp ## imm = 0xFC18
+; CHECK-NEXT:    jne LBB0_5
+; CHECK-NEXT:  ## %bb.10: ## %for.cond18
+; CHECK-NEXT:    ## in Loop: Header=BB0_11 Depth=2
+; CHECK-NEXT:    cmpl $999, %edi ## imm = 0x3E7
+; CHECK-NEXT:    jg LBB0_15
+; CHECK-NEXT:  LBB0_11: ## %for.body20
 ; CHECK-NEXT:    ## Parent Loop BB0_9 Depth=1
 ; CHECK-NEXT:    ## => This Loop Header: Depth=2
 ; CHECK-NEXT:    ## Child Loop BB0_12 Depth 3
-; CHECK-NEXT:    cmpl $999, %edi ## imm = 0x3E7
-; CHECK-NEXT:    jg LBB0_15
-; CHECK-NEXT:  ## %bb.11: ## %for.body20
-; CHECK-NEXT:    ## in Loop: Header=BB0_10 Depth=2
 ; CHECK-NEXT:    movq $-1000, %rbp ## imm = 0xFC18
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  LBB0_12: ## %do.body.i
 ; CHECK-NEXT:    ## Parent Loop BB0_9 Depth=1
-; CHECK-NEXT:    ## Parent Loop BB0_10 Depth=2
+; CHECK-NEXT:    ## Parent Loop BB0_11 Depth=2
 ; CHECK-NEXT:    ## => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    cmpb $120, 1000(%rdx,%rbp)
 ; CHECK-NEXT:    je LBB0_14
@@ -111,16 +122,6 @@ define i32 @test_branches_order() uwtable ssp {
 ; CHECK-NEXT:    ## in Loop: Header=BB0_12 Depth=3
 ; CHECK-NEXT:    incq %rbp
 ; CHECK-NEXT:    jne LBB0_12
-; CHECK-NEXT:    jmp LBB0_5
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  LBB0_14: ## %exit
-; CHECK-NEXT:    ## in Loop: Header=BB0_10 Depth=2
-; CHECK-NEXT:    addq %rsi, %rbp
-; CHECK-NEXT:    incq %rdi
-; CHECK-NEXT:    decq %rsi
-; CHECK-NEXT:    addq $1001, %rdx ## imm = 0x3E9
-; CHECK-NEXT:    cmpq $-1000, %rbp ## imm = 0xFC18
-; CHECK-NEXT:    je LBB0_10
 ; CHECK-NEXT:  LBB0_5: ## %if.then
 ; CHECK-NEXT:    leaq {{.*}}(%rip), %rdi
 ; CHECK-NEXT:    callq _puts
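In the new layout above, the depth-2 latch chain (LBB0_14 %exit followed by the recheck in %bb.10) sits directly above the header LBB0_11 and falls through into it, so the only taken branch on the back path is the je LBB0_14 out of the inner loop. A minimal hand-written .ll sketch of a loop with this shape, a latch with a conditional exit and more than one predecessor, purely for illustration (function and label names are invented, not taken from this test):

define void @latch_demo(i32* %p, i32 %n) {
entry:
  br label %header

header:                                   ; preds = %latch, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %latch ]
  %v = load i32, i32* %p
  %c = icmp sgt i32 %v, 0
  br i1 %c, label %then, label %else      ; gives %latch two predecessors

then:                                     ; preds = %header
  store i32 0, i32* %p
  br label %latch

else:                                     ; preds = %header
  br label %latch

latch:                                    ; preds = %then, %else
  %i.next = add nsw i32 %i, 1
  %cond = icmp slt i32 %i.next, %n
  br i1 %cond, label %header, label %exit ; conditional exit from the latch

exit:                                     ; preds = %latch
  ret void
}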

+ 30 - 27
test/CodeGen/X86/speculative-load-hardening.ll

@@ -215,10 +215,7 @@ define void @test_basic_loop(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) nounwind sp
 ; X64-NEXT:    movl %esi, %ebp
 ; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    xorl %ebx, %ebx
-; X64-NEXT:    jmp .LBB2_3
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB2_6: # in Loop: Header=BB2_3 Depth=1
-; X64-NEXT:    cmovgeq %r15, %rax
 ; X64-NEXT:  .LBB2_3: # %l.header
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movslq (%r12), %rcx
@@ -237,8 +234,11 @@ define void @test_basic_loop(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) nounwind sp
 ; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    incl %ebx
 ; X64-NEXT:    cmpl %ebp, %ebx
-; X64-NEXT:    jl .LBB2_6
-; X64-NEXT:  # %bb.4:
+; X64-NEXT:    jge .LBB2_4
+; X64-NEXT:  # %bb.6: # in Loop: Header=BB2_3 Depth=1
+; X64-NEXT:    cmovgeq %r15, %rax
+; X64-NEXT:    jmp .LBB2_3
+; X64-NEXT:  .LBB2_4:
 ; X64-NEXT:    cmovlq %r15, %rax
 ; X64-NEXT:  .LBB2_5: # %exit
 ; X64-NEXT:    shlq $47, %rax
@@ -328,20 +328,12 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %pt
 ; X64-NEXT:    xorl %r13d, %r13d
 ; X64-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    testl %r15d, %r15d
-; X64-NEXT:    jg .LBB3_5
-; X64-NEXT:    jmp .LBB3_4
-; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_12:
-; X64-NEXT:    cmovgeq %rbp, %rax
-; X64-NEXT:    testl %r15d, %r15d
 ; X64-NEXT:    jle .LBB3_4
+; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_5: # %l2.header.preheader
 ; X64-NEXT:    cmovleq %rbp, %rax
 ; X64-NEXT:    xorl %r15d, %r15d
-; X64-NEXT:    jmp .LBB3_6
 ; X64-NEXT:    .p2align 4, 0x90
-; X64-NEXT:  .LBB3_11: # in Loop: Header=BB3_6 Depth=1
-; X64-NEXT:    cmovgeq %rbp, %rax
 ; X64-NEXT:  .LBB3_6: # %l2.header
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movslq (%rbx), %rcx
@@ -360,8 +352,12 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %pt
 ; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    incl %r15d
 ; X64-NEXT:    cmpl %r12d, %r15d
-; X64-NEXT:    jl .LBB3_11
-; X64-NEXT:  # %bb.7:
+; X64-NEXT:    jge .LBB3_7
+; X64-NEXT:  # %bb.11: # in Loop: Header=BB3_6 Depth=1
+; X64-NEXT:    cmovgeq %rbp, %rax
+; X64-NEXT:    jmp .LBB3_6
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB3_7:
 ; X64-NEXT:    cmovlq %rbp, %rax
 ; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload
 ; X64-NEXT:    jmp .LBB3_8
@@ -385,8 +381,13 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %pt
 ; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    incl %r13d
 ; X64-NEXT:    cmpl %r15d, %r13d
-; X64-NEXT:    jl .LBB3_12
-; X64-NEXT:  # %bb.9:
+; X64-NEXT:    jge .LBB3_9
+; X64-NEXT:  # %bb.12:
+; X64-NEXT:    cmovgeq %rbp, %rax
+; X64-NEXT:    testl %r15d, %r15d
+; X64-NEXT:    jg .LBB3_5
+; X64-NEXT:    jmp .LBB3_4
+; X64-NEXT:  .LBB3_9:
 ; X64-NEXT:    cmovlq %rbp, %rax
 ; X64-NEXT:  .LBB3_10: # %exit
 ; X64-NEXT:    shlq $47, %rax
@@ -418,7 +419,17 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %pt
 ; X64-LFENCE-NEXT:    movl %esi, %r15d
 ; X64-LFENCE-NEXT:    lfence
 ; X64-LFENCE-NEXT:    xorl %r12d, %r12d
+; X64-LFENCE-NEXT:    jmp .LBB3_2
 ; X64-LFENCE-NEXT:    .p2align 4, 0x90
+; X64-LFENCE-NEXT:  .LBB3_5: # %l1.latch
+; X64-LFENCE-NEXT:    # in Loop: Header=BB3_2 Depth=1
+; X64-LFENCE-NEXT:    lfence
+; X64-LFENCE-NEXT:    movslq (%rbx), %rax
+; X64-LFENCE-NEXT:    movl (%r14,%rax,4), %edi
+; X64-LFENCE-NEXT:    callq sink
+; X64-LFENCE-NEXT:    incl %r12d
+; X64-LFENCE-NEXT:    cmpl %r15d, %r12d
+; X64-LFENCE-NEXT:    jge .LBB3_6
 ; X64-LFENCE-NEXT:  .LBB3_2: # %l1.header
 ; X64-LFENCE-NEXT:    # =>This Loop Header: Depth=1
 ; X64-LFENCE-NEXT:    # Child Loop BB3_4 Depth 2
@@ -440,15 +451,7 @@ define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %pt
 ; X64-LFENCE-NEXT:    incl %ebp
 ; X64-LFENCE-NEXT:    cmpl %r13d, %ebp
 ; X64-LFENCE-NEXT:    jl .LBB3_4
-; X64-LFENCE-NEXT:  .LBB3_5: # %l1.latch
-; X64-LFENCE-NEXT:    # in Loop: Header=BB3_2 Depth=1
-; X64-LFENCE-NEXT:    lfence
-; X64-LFENCE-NEXT:    movslq (%rbx), %rax
-; X64-LFENCE-NEXT:    movl (%r14,%rax,4), %edi
-; X64-LFENCE-NEXT:    callq sink
-; X64-LFENCE-NEXT:    incl %r12d
-; X64-LFENCE-NEXT:    cmpl %r15d, %r12d
-; X64-LFENCE-NEXT:    jl .LBB3_2
+; X64-LFENCE-NEXT:    jmp .LBB3_5
 ; X64-LFENCE-NEXT:  .LBB3_6: # %exit
 ; X64-LFENCE-NEXT:    lfence
 ; X64-LFENCE-NEXT:    addq $8, %rsp
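In the X64-LFENCE function the whole latch block .LBB3_5 (%l1.latch), including its callq sink, moves above the outer header .LBB3_2; it now exits with jge .LBB3_6 and otherwise falls through into the header, with entry and the inner-loop exit jumping to .LBB3_2 and .LBB3_5 respectively. A reduced nested-loop .ll sketch with that structure, assuming a sink() callee as in the test (everything else invented):

define void @nested_demo(i32 %n, i32 %m) {
entry:
  br label %l1.header

l1.header:                                ; preds = %l1.latch, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %l1.latch ]
  br label %l2.header

l2.header:                                ; preds = %l2.header, %l1.header
  %j = phi i32 [ 0, %l1.header ], [ %j.next, %l2.header ]
  %j.next = add nsw i32 %j, 1
  %c2 = icmp slt i32 %j.next, %m
  br i1 %c2, label %l2.header, label %l1.latch

l1.latch:                                 ; preds = %l2.header
  call void @sink(i32 %i)                 ; the latch carries a call, as in the test
  %i.next = add nsw i32 %i, 1
  %c1 = icmp slt i32 %i.next, %n
  br i1 %c1, label %l1.header, label %exit

exit:                                     ; preds = %l1.latch
  ret void
}

declare void @sink(i32)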

+ 3 - 3
test/CodeGen/X86/swifterror.ll

@@ -1,6 +1,6 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=CHECK-APPLE %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-apple-darwin -O0 | FileCheck --check-prefix=CHECK-O0 %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin | FileCheck --check-prefix=CHECK-i386 %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=x86_64-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-APPLE %s
+; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=x86_64-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-O0 %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=i386-apple-darwin -disable-block-placement | FileCheck --check-prefix=CHECK-i386 %s
 
 declare i8* @malloc(i64)
 declare void @free(i8*)
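The swifterror checks here verify how the error value travels through registers, not where blocks land, so the RUN lines pin the old layout with -disable-block-placement rather than regenerating every label. A hypothetical RUN header for a new test pinned the same way (triple and check prefix chosen only for illustration):

; RUN: llc -verify-machineinstrs -disable-block-placement < %s -mtriple=x86_64-apple-darwin | FileCheck %s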

+ 35 - 32
test/CodeGen/X86/tail-dup-merge-loop-headers.ll

@@ -12,14 +12,17 @@ define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0
 ; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
 ; CHECK-NEXT:    incq %rsi
 ; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    jne .LBB0_2
-; CHECK-NEXT:    jmp .LBB0_5
+; CHECK-NEXT:    je .LBB0_5
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %inner_loop_top
+; CHECK-NEXT:    # =>This Loop Header: Depth=1
+; CHECK-NEXT:    # Child Loop BB0_4 Depth 2
+; CHECK-NEXT:    cmpb $0, (%rsi)
+; CHECK-NEXT:    js .LBB0_3
 ; CHECK-NEXT:  .LBB0_4: # %inner_loop_latch
-; CHECK-NEXT:    # in Loop: Header=BB0_2 Depth=1
+; CHECK-NEXT:    # Parent Loop BB0_2 Depth=1
+; CHECK-NEXT:    # => This Inner Loop Header: Depth=2
 ; CHECK-NEXT:    addq $2, %rsi
-; CHECK-NEXT:  .LBB0_2: # %inner_loop_top
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    cmpb $0, (%rsi)
 ; CHECK-NEXT:    jns .LBB0_4
 ; CHECK-NEXT:    jmp .LBB0_3
@@ -130,58 +133,58 @@ define i32 @loop_shared_header(i8* %exe, i32 %exesz, i32 %headsize, i32 %min, i3
 ; CHECK-NEXT:    testl %ebp, %ebp
 ; CHECK-NEXT:    je .LBB1_18
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB1_8: # %shared_loop_header
+; CHECK-NEXT:  .LBB1_9: # %shared_loop_header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    testq %rbx, %rbx
 ; CHECK-NEXT:    jne .LBB1_27
-; CHECK-NEXT:  # %bb.9: # %inner_loop_body
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:  # %bb.10: # %inner_loop_body
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    testl %eax, %eax
-; CHECK-NEXT:    jns .LBB1_8
-; CHECK-NEXT:  # %bb.10: # %if.end96.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jns .LBB1_9
+; CHECK-NEXT:  # %bb.11: # %if.end96.i
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    cmpl $3, %ebp
 ; CHECK-NEXT:    jae .LBB1_22
-; CHECK-NEXT:  # %bb.11: # %if.end287.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:  # %bb.12: # %if.end287.i
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    xorl %esi, %esi
 ; CHECK-NEXT:    cmpl $1, %ebp
 ; CHECK-NEXT:    setne %dl
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne .LBB1_15
-; CHECK-NEXT:  # %bb.12: # %if.end308.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jne .LBB1_16
+; CHECK-NEXT:  # %bb.13: # %if.end308.i
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    je .LBB1_17
-; CHECK-NEXT:  # %bb.13: # %if.end335.i
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    je .LBB1_7
+; CHECK-NEXT:  # %bb.14: # %if.end335.i
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    testb %dl, %dl
 ; CHECK-NEXT:    movl $0, %esi
-; CHECK-NEXT:    jne .LBB1_7
-; CHECK-NEXT:  # %bb.14: # %merge_other
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jne .LBB1_8
+; CHECK-NEXT:  # %bb.15: # %merge_other
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    jmp .LBB1_16
-; CHECK-NEXT:  .LBB1_15: # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jmp .LBB1_17
+; CHECK-NEXT:  .LBB1_16: # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    movb %dl, %sil
 ; CHECK-NEXT:    addl $3, %esi
-; CHECK-NEXT:  .LBB1_16: # %outer_loop_latch
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:  .LBB1_17: # %outer_loop_latch
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    # implicit-def: $dl
-; CHECK-NEXT:    jmp .LBB1_7
-; CHECK-NEXT:  .LBB1_17: # %merge_predecessor_split
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:    jmp .LBB1_8
+; CHECK-NEXT:  .LBB1_7: # %merge_predecessor_split
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    movb $32, %dl
 ; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:  .LBB1_7: # %outer_loop_latch
-; CHECK-NEXT:    # in Loop: Header=BB1_8 Depth=1
+; CHECK-NEXT:  .LBB1_8: # %outer_loop_latch
+; CHECK-NEXT:    # in Loop: Header=BB1_9 Depth=1
 ; CHECK-NEXT:    movzwl %si, %esi
 ; CHECK-NEXT:    decl %esi
 ; CHECK-NEXT:    movzwl %si, %esi
 ; CHECK-NEXT:    leaq 1(%rcx,%rsi), %rcx
 ; CHECK-NEXT:    testl %ebp, %ebp
-; CHECK-NEXT:    jne .LBB1_8
+; CHECK-NEXT:    jne .LBB1_9
 ; CHECK-NEXT:  .LBB1_18: # %while.cond.us1412.i
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al

+ 10 - 15
test/CodeGen/X86/tail-dup-repeat.ll

@@ -10,35 +10,30 @@
 define void @repeated_tail_dup(i1 %a1, i1 %a2, i32* %a4, i32* %a5, i8* %a6, i32 %a7) #0 align 2 {
 ; CHECK-LABEL: repeated_tail_dup:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_1: # %for.cond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    testb $1, %dil
 ; CHECK-NEXT:    je .LBB0_3
-; CHECK-NEXT:  # %bb.2: # %land.lhs.true
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl $10, (%rdx)
-; CHECK-NEXT:    jmp .LBB0_6
 ; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_2: # %land.lhs.true
+; CHECK-NEXT:    movl $10, (%rdx)
+; CHECK-NEXT:  .LBB0_6: # %dup2
+; CHECK-NEXT:    movl $2, (%rcx)
+; CHECK-NEXT:    testl %r9d, %r9d
+; CHECK-NEXT:    jne .LBB0_8
+; CHECK-NEXT:  .LBB0_1: # %for.cond
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  .LBB0_3: # %if.end56
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testb $1, %sil
 ; CHECK-NEXT:    je .LBB0_5
 ; CHECK-NEXT:  # %bb.4: # %if.then64
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movb $1, (%r8)
 ; CHECK-NEXT:    testl %r9d, %r9d
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:    jmp .LBB0_8
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB0_5: # %if.end70
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl $12, (%rdx)
-; CHECK-NEXT:  .LBB0_6: # %dup2
-; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT:    movl $2, (%rcx)
-; CHECK-NEXT:    testl %r9d, %r9d
-; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:    jmp .LBB0_6
 ; CHECK-NEXT:  .LBB0_8: # %for.end
 ; CHECK-NEXT:    retq
 entry:

+ 35 - 32
test/CodeGen/X86/vector-shift-by-select-loop.ll

@@ -136,8 +136,17 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; SSE-NEXT:    jne .LBB0_4
 ; SSE-NEXT:  # %bb.5: # %middle.block
 ; SSE-NEXT:    cmpq %rax, %rdx
-; SSE-NEXT:    je .LBB0_9
+; SSE-NEXT:    jne .LBB0_6
+; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
+; SSE-NEXT:    retq
 ; SSE-NEXT:    .p2align 4, 0x90
+; SSE-NEXT:  .LBB0_8: # %for.body
+; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
+; SSE-NEXT:    incq %rdx
+; SSE-NEXT:    cmpq %rdx, %rax
+; SSE-NEXT:    je .LBB0_9
 ; SSE-NEXT:  .LBB0_6: # %for.body
 ; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
 ; SSE-NEXT:    cmpb $0, (%rsi,%rdx)
@@ -146,15 +155,7 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; SSE-NEXT:  # %bb.7: # %for.body
 ; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
 ; SSE-NEXT:    movl %r8d, %ecx
-; SSE-NEXT:  .LBB0_8: # %for.body
-; SSE-NEXT:    # in Loop: Header=BB0_6 Depth=1
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shll %cl, (%rdi,%rdx,4)
-; SSE-NEXT:    incq %rdx
-; SSE-NEXT:    cmpq %rdx, %rax
-; SSE-NEXT:    jne .LBB0_6
-; SSE-NEXT:  .LBB0_9: # %for.cond.cleanup
-; SSE-NEXT:    retq
+; SSE-NEXT:    jmp .LBB0_8
 ;
 ; AVX1-LABEL: vector_variable_shift_left_loop:
 ; AVX1:       # %bb.0: # %entry
@@ -258,8 +259,18 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX1-NEXT:    jne .LBB0_4
 ; AVX1-NEXT:  # %bb.5: # %middle.block
 ; AVX1-NEXT:    cmpq %rax, %rdx
-; AVX1-NEXT:    je .LBB0_9
+; AVX1-NEXT:    jne .LBB0_6
+; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
+; AVX1-NEXT:    vzeroupper
+; AVX1-NEXT:    retq
 ; AVX1-NEXT:    .p2align 4, 0x90
+; AVX1-NEXT:  .LBB0_8: # %for.body
+; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
+; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
+; AVX1-NEXT:    incq %rdx
+; AVX1-NEXT:    cmpq %rdx, %rax
+; AVX1-NEXT:    je .LBB0_9
 ; AVX1-NEXT:  .LBB0_6: # %for.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX1-NEXT:    cmpb $0, (%rsi,%rdx)
@@ -268,16 +279,7 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX1-NEXT:  # %bb.7: # %for.body
 ; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
 ; AVX1-NEXT:    movl %r8d, %ecx
-; AVX1-NEXT:  .LBB0_8: # %for.body
-; AVX1-NEXT:    # in Loop: Header=BB0_6 Depth=1
-; AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
-; AVX1-NEXT:    shll %cl, (%rdi,%rdx,4)
-; AVX1-NEXT:    incq %rdx
-; AVX1-NEXT:    cmpq %rdx, %rax
-; AVX1-NEXT:    jne .LBB0_6
-; AVX1-NEXT:  .LBB0_9: # %for.cond.cleanup
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
+; AVX1-NEXT:    jmp .LBB0_8
 ;
 ; AVX2-LABEL: vector_variable_shift_left_loop:
 ; AVX2:       # %bb.0: # %entry
@@ -332,8 +334,18 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX2-NEXT:    jne .LBB0_4
 ; AVX2-NEXT:  # %bb.5: # %middle.block
 ; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    je .LBB0_9
+; AVX2-NEXT:    jne .LBB0_6
+; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ; AVX2-NEXT:    .p2align 4, 0x90
+; AVX2-NEXT:  .LBB0_8: # %for.body
+; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
+; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
+; AVX2-NEXT:    incq %rdx
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    je .LBB0_9
 ; AVX2-NEXT:  .LBB0_6: # %for.body
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT:    cmpb $0, (%rsi,%rdx)
@@ -342,16 +354,7 @@ define void @vector_variable_shift_left_loop(i32* nocapture %arr, i8* nocapture
 ; AVX2-NEXT:  # %bb.7: # %for.body
 ; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
 ; AVX2-NEXT:    movl %r8d, %ecx
-; AVX2-NEXT:  .LBB0_8: # %for.body
-; AVX2-NEXT:    # in Loop: Header=BB0_6 Depth=1
-; AVX2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; AVX2-NEXT:    shll %cl, (%rdi,%rdx,4)
-; AVX2-NEXT:    incq %rdx
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    jne .LBB0_6
-; AVX2-NEXT:  .LBB0_9: # %for.cond.cleanup
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX2-NEXT:    jmp .LBB0_8
 entry:
   %cmp12 = icmp sgt i32 %count, 0
   br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

+ 8 - 8
test/CodeGen/X86/widen_arith-1.ll

@@ -8,9 +8,13 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    pcmpeqd %xmm0, %xmm0
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %forbody
+; CHECK-NEXT:  .LBB0_1: # %forcond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl (%esp), %eax
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl (%esp), %eax
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -21,12 +25,8 @@ define void @update(<3 x i8>* %dst, <3 x i8>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    pshufb %xmm1, %xmm2
 ; CHECK-NEXT:    pextrw $0, %xmm2, (%ecx,%eax,4)
 ; CHECK-NEXT:    incl (%esp)
-; CHECK-NEXT:  .LBB0_1: # %forcond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl (%esp), %eax
-; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    jl .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %afterfor
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %afterfor
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
 entry:
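This test and the following widen_arith / widen_cast tests change in the same way: the top-tested loop now enters at its header .LBB0_1 (%forcond), the exit test is inverted from jl .LBB0_2 to jge .LBB0_3, and the back edge becomes an unconditional jmp .LBB0_1. A hand-reduced .ll loop with that forcond/forbody/afterfor shape (hypothetical, with the vector body collapsed to a single store):

define void @forcond_demo(i32* %p, i32 %n) {
entry:
  br label %forcond

forcond:                                  ; preds = %forbody, %entry
  %i = phi i32 [ 0, %entry ], [ %i.next, %forbody ]
  %c = icmp slt i32 %i, %n                ; header is also the exiting block
  br i1 %c, label %forbody, label %afterfor

forbody:                                  ; preds = %forcond
  %addr = getelementptr inbounds i32, i32* %p, i32 %i
  store i32 %i, i32* %addr
  %i.next = add nsw i32 %i, 1
  br label %forcond                       ; single-successor latch

afterfor:                                 ; preds = %forcond
  ret void
}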

+ 8 - 8
test/CodeGen/X86/widen_arith-2.ll

@@ -10,9 +10,13 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $0, (%esp)
 ; CHECK-NEXT:    pcmpeqd %xmm0, %xmm0
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4]
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %forbody
+; CHECK-NEXT:  .LBB0_1: # %forcond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl (%esp), %eax
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl (%esp), %eax
 ; CHECK-NEXT:    leal (,%eax,8), %ecx
@@ -27,12 +31,8 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; CHECK-NEXT:    packuswb %xmm0, %xmm2
 ; CHECK-NEXT:    movq %xmm2, (%edx,%eax,8)
 ; CHECK-NEXT:    incl (%esp)
-; CHECK-NEXT:  .LBB0_1: # %forcond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl (%esp), %eax
-; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    jl .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %afterfor
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %afterfor
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    retl
 entry:

+ 8 - 8
test/CodeGen/X86/widen_arith-3.ll

@@ -18,9 +18,13 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movw $1, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %forbody
+; CHECK-NEXT:  .LBB0_1: # %forcond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl 16(%ebp), %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl 12(%ebp), %edx
@@ -31,12 +35,8 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    pshufb %xmm1, %xmm2
 ; CHECK-NEXT:    movd %xmm2, (%ecx,%eax,8)
 ; CHECK-NEXT:    incl {{[0-9]+}}(%esp)
-; CHECK-NEXT:  .LBB0_1: # %forcond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    cmpl 16(%ebp), %eax
-; CHECK-NEXT:    jl .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %afterfor
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %afterfor
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl

+ 16 - 16
test/CodeGen/X86/widen_arith-4.ll

@@ -16,9 +16,13 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE2-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u>
-; SSE2-NEXT:    jmp .LBB0_1
 ; SSE2-NEXT:    .p2align 4, 0x90
-; SSE2-NEXT:  .LBB0_2: # %forbody
+; SSE2-NEXT:  .LBB0_1: # %forcond
+; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    jge .LBB0_3
+; SSE2-NEXT:  # %bb.2: # %forbody
 ; SSE2-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; SSE2-NEXT:    movslq -{{[0-9]+}}(%rsp), %rax
 ; SSE2-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
@@ -31,12 +35,8 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE2-NEXT:    pextrw $4, %xmm2, %edx
 ; SSE2-NEXT:    movw %dx, 8(%rcx,%rax)
 ; SSE2-NEXT:    incl -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:  .LBB0_1: # %forcond
-; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    jl .LBB0_2
-; SSE2-NEXT:  # %bb.3: # %afterfor
+; SSE2-NEXT:    jmp .LBB0_1
+; SSE2-NEXT:  .LBB0_3: # %afterfor
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: update:
@@ -49,9 +49,13 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE41-NEXT:    movw $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u>
-; SSE41-NEXT:    jmp .LBB0_1
 ; SSE41-NEXT:    .p2align 4, 0x90
-; SSE41-NEXT:  .LBB0_2: # %forbody
+; SSE41-NEXT:  .LBB0_1: # %forcond
+; SSE41-NEXT:    # =>This Inner Loop Header: Depth=1
+; SSE41-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
+; SSE41-NEXT:    jge .LBB0_3
+; SSE41-NEXT:  # %bb.2: # %forbody
 ; SSE41-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; SSE41-NEXT:    movslq -{{[0-9]+}}(%rsp), %rax
 ; SSE41-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
@@ -66,12 +70,8 @@ define void @update(<5 x i16>* %dst, <5 x i16>* %src, i32 %n) nounwind {
 ; SSE41-NEXT:    pextrw $4, %xmm1, 8(%rcx,%rax)
 ; SSE41-NEXT:    movq %xmm2, (%rcx,%rax)
 ; SSE41-NEXT:    incl -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:  .LBB0_1: # %forcond
-; SSE41-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE41-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; SSE41-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
-; SSE41-NEXT:    jl .LBB0_2
-; SSE41-NEXT:  # %bb.3: # %afterfor
+; SSE41-NEXT:    jmp .LBB0_1
+; SSE41-NEXT:  .LBB0_3: # %afterfor
 ; SSE41-NEXT:    retq
 entry:
 	%dst.addr = alloca <5 x i16>*

+ 8 - 8
test/CodeGen/X86/widen_arith-5.ll

@@ -14,9 +14,13 @@ define void @update(<3 x i32>* %dst, <3 x i32>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movdqa {{.*#+}} xmm0 = <3,3,3,u>
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %forbody
+; CHECK-NEXT:  .LBB0_1: # %forcond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movslq -{{[0-9]+}}(%rsp), %rax
 ; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
@@ -28,12 +32,8 @@ define void @update(<3 x i32>* %dst, <3 x i32>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    pextrd $2, %xmm1, 8(%rcx,%rax)
 ; CHECK-NEXT:    movq %xmm1, (%rcx,%rax)
 ; CHECK-NEXT:    incl -{{[0-9]+}}(%rsp)
-; CHECK-NEXT:  .LBB0_1: # %forcond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    cmpl -{{[0-9]+}}(%rsp), %eax
-; CHECK-NEXT:    jl .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %afterfor
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %afterfor
 ; CHECK-NEXT:    retq
 entry:
 	%dst.addr = alloca <3 x i32>*

+ 8 - 8
test/CodeGen/X86/widen_arith-6.ll

@@ -15,9 +15,13 @@ define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000
 ; CHECK-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; CHECK-NEXT:    movaps {{.*#+}} xmm0 = <1.97604004E+3,1.97604004E+3,1.97604004E+3,u>
-; CHECK-NEXT:    jmp .LBB0_1
 ; CHECK-NEXT:    .p2align 4, 0x90
-; CHECK-NEXT:  .LBB0_2: # %forbody
+; CHECK-NEXT:  .LBB0_1: # %forcond
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl 16(%ebp), %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # %bb.2: # %forbody
 ; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    movl 8(%ebp), %ecx
@@ -30,12 +34,8 @@ define void @update(<3 x float>* %dst, <3 x float>* %src, i32 %n) nounwind {
 ; CHECK-NEXT:    extractps $1, %xmm1, 4(%ecx,%eax)
 ; CHECK-NEXT:    movss %xmm1, (%ecx,%eax)
 ; CHECK-NEXT:    incl {{[0-9]+}}(%esp)
-; CHECK-NEXT:  .LBB0_1: # %forcond
-; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    cmpl 16(%ebp), %eax
-; CHECK-NEXT:    jl .LBB0_2
-; CHECK-NEXT:  # %bb.3: # %afterfor
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_3: # %afterfor
 ; CHECK-NEXT:    movl %ebp, %esp
 ; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    retl

+ 16 - 16
test/CodeGen/X86/widen_cast-4.ll

@@ -11,9 +11,13 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; NARROW-NEXT:    movl $0, (%esp)
 ; NARROW-NEXT:    pcmpeqd %xmm0, %xmm0
 ; NARROW-NEXT:    movdqa {{.*#+}} xmm1 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; NARROW-NEXT:    jmp .LBB0_1
 ; NARROW-NEXT:    .p2align 4, 0x90
-; NARROW-NEXT:  .LBB0_2: # %forbody
+; NARROW-NEXT:  .LBB0_1: # %forcond
+; NARROW-NEXT:    # =>This Inner Loop Header: Depth=1
+; NARROW-NEXT:    movl (%esp), %eax
+; NARROW-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; NARROW-NEXT:    jge .LBB0_3
+; NARROW-NEXT:  # %bb.2: # %forbody
 ; NARROW-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; NARROW-NEXT:    movl (%esp), %eax
 ; NARROW-NEXT:    leal (,%eax,8), %ecx
@@ -30,12 +34,8 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; NARROW-NEXT:    pshufb %xmm1, %xmm2
 ; NARROW-NEXT:    movq %xmm2, (%edx,%eax,8)
 ; NARROW-NEXT:    incl (%esp)
-; NARROW-NEXT:  .LBB0_1: # %forcond
-; NARROW-NEXT:    # =>This Inner Loop Header: Depth=1
-; NARROW-NEXT:    movl (%esp), %eax
-; NARROW-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; NARROW-NEXT:    jl .LBB0_2
-; NARROW-NEXT:  # %bb.3: # %afterfor
+; NARROW-NEXT:    jmp .LBB0_1
+; NARROW-NEXT:  .LBB0_3: # %afterfor
 ; NARROW-NEXT:    addl $12, %esp
 ; NARROW-NEXT:    retl
 ;
@@ -46,9 +46,13 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; WIDE-NEXT:    pcmpeqd %xmm0, %xmm0
 ; WIDE-NEXT:    movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
 ; WIDE-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; WIDE-NEXT:    jmp .LBB0_1
 ; WIDE-NEXT:    .p2align 4, 0x90
-; WIDE-NEXT:  .LBB0_2: # %forbody
+; WIDE-NEXT:  .LBB0_1: # %forcond
+; WIDE-NEXT:    # =>This Inner Loop Header: Depth=1
+; WIDE-NEXT:    movl (%esp), %eax
+; WIDE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; WIDE-NEXT:    jge .LBB0_3
+; WIDE-NEXT:  # %bb.2: # %forbody
 ; WIDE-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; WIDE-NEXT:    movl (%esp), %eax
 ; WIDE-NEXT:    leal (,%eax,8), %ecx
@@ -65,12 +69,8 @@ define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
 ; WIDE-NEXT:    psubb %xmm2, %xmm3
 ; WIDE-NEXT:    movq %xmm3, (%edx,%eax,8)
 ; WIDE-NEXT:    incl (%esp)
-; WIDE-NEXT:  .LBB0_1: # %forcond
-; WIDE-NEXT:    # =>This Inner Loop Header: Depth=1
-; WIDE-NEXT:    movl (%esp), %eax
-; WIDE-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
-; WIDE-NEXT:    jl .LBB0_2
-; WIDE-NEXT:  # %bb.3: # %afterfor
+; WIDE-NEXT:    jmp .LBB0_1
+; WIDE-NEXT:  .LBB0_3: # %afterfor
 ; WIDE-NEXT:    addl $12, %esp
 ; WIDE-NEXT:    retl
 entry:

+ 1 - 1
test/CodeGen/X86/x86-cmov-converter.ll

@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs -disable-block-placement < %s | FileCheck -allow-deprecated-dag-overlap %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This test checks that x86-cmov-converter optimization transform CMOV

+ 6 - 6
test/DebugInfo/X86/PR37234.ll

@@ -21,18 +21,18 @@
 ; CHECK-LABEL: # %bb.{{.*}}:
 ; CHECK:        #DEBUG_VALUE: main:aa <- 0
 ; CHECK: 	#DEBUG_VALUE: main:aa <- $[[REG:[0-9a-z]+]]
-; CHECK: 	jmp	.LBB0_1
-; CHECK: .LBB0_2:
+; CHECK: .LBB0_1:
+; CHECK:        #DEBUG_VALUE: main:aa <- $[[REG]]
+; CHECK:        je      .LBB0_4
+; CHECK: # %bb.{{.*}}:
 ; CHECK:        #DEBUG_VALUE: main:aa <- $[[REG]]
 ; CHECK:        jne     .LBB0_1
 ; CHECK: # %bb.{{.*}}:
 ; CHECK:        #DEBUG_VALUE: main:aa <- $[[REG]]
 ; CHECK:        incl    %[[REG]]
 ; CHECK:        #DEBUG_VALUE: main:aa <- $[[REG]]
-; CHECK: .LBB0_1:
-; CHECK: 	#DEBUG_VALUE: main:aa <- $[[REG]]
-; CHECK:        jne     .LBB0_2
-; CHECK: # %bb.{{.*}}:
+; CHECK:        jmp     .LBB0_1
+; CHECK: .LBB0_4:
 ; CHECK: 	#DEBUG_VALUE: main:aa <- $[[REG]]
 ; CHECK: 	retq
 

+ 7 - 6
test/DebugInfo/X86/dbg-value-transfer-order.ll

@@ -24,6 +24,12 @@
 ; with the Orders insertion point vector.
 
 ; CHECK-LABEL: f: # @f
+; CHECK: .LBB0_4:
+;        Check that this DEBUG_VALUE comes before the left shift.
+; CHECK:         #DEBUG_VALUE: bit_offset <- $ecx
+; CHECK:         .cv_loc 0 1 8 28                # t.c:8:28
+; CHECK:         movl    $1, %[[reg:[^ ]*]]
+; CHECK:         shll    %cl, %[[reg]]
 ; CHECK: .LBB0_2:                                # %while.body
 ; CHECK:         movl    $32, %ecx
 ; CHECK:         testl   {{.*}}
@@ -31,12 +37,7 @@
 ; CHECK: # %bb.3:                                 # %if.then
 ; CHECK:         callq   if_then
 ; CHECK:         movl    %eax, %ecx
-; CHECK: .LBB0_4:                                # %if.end
-;        Check that this DEBUG_VALUE comes before the left shift.
-; CHECK:         #DEBUG_VALUE: bit_offset <- $ecx
-; CHECK:         .cv_loc 0 1 8 28                # t.c:8:28
-; CHECK:         movl    $1, %[[reg:[^ ]*]]
-; CHECK:         shll    %cl, %[[reg]]
+; CHECK:         jmp     .LBB0_4
 
 ; ModuleID = 't.c'
 source_filename = "t.c"