7 ani în urmă · c07df42b1a
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -282,9 +282,12 @@ namespace {
 
				 class LoopVectorizationLegality;
			
 
				 class LoopVectorizationCostModel;
			
 
				 class LoopVectorizationRequirements;
			
 
				+class VPBlendRecipe;
			
 
				 class VPInterleaveRecipe;
			
 
				 class VPReplicateRecipe;
			
 
				 class VPWidenIntOrFpInductionRecipe;
			
 
				+class VPWidenRecipe;
			
 
				+class VPWidenMemoryInstructionRecipe;
			
 
				 
			
 
				 } // end anonymous namespace
			
 
				 
			
@@ -452,6 +455,10 @@ public:
 
				   /// mask for the block BB.
			
 
				   VectorParts createBlockInMask(BasicBlock *BB);
			
 
				 
			
 
				+  /// A helper function that computes the predicate of the edge between SRC
			
 
				+  /// and DST.
			
 
				+  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
			
 
				+
			
 
				   /// Vectorize a single PHINode in a block. This method handles the induction
			
 
				   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
			
 
				   /// arbitrary length vectors.
			
@@ -504,6 +511,13 @@ public:
 
				   /// Try to vectorize the interleaved access group that \p Instr belongs to.
			
 
				   void vectorizeInterleaveGroup(Instruction *Instr);
			
 
				 
			
 
				+  /// Vectorize Load and Store instructions.
			
 
				+  virtual void vectorizeMemoryInstruction(Instruction *Instr);
			
 
				+
			
 
				+  /// \brief Set the debug location in the builder using the debug location in
			
 
				+  /// the instruction.
			
 
				+  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
			
 
				+
			
 
				 protected:
			
 
				   friend class LoopVectorizationPlanner;
			
 
				 
			
@@ -556,17 +570,10 @@ protected:
 
				   /// represented as.
			
 
				   void truncateToMinimalBitwidths();
			
 
				 
			
 
				-  /// A helper function that computes the predicate of the edge between SRC
			
 
				-  /// and DST.
			
 
				-  VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
			
 
				-
			
 
				   /// Insert the new loop to the loop hierarchy and pass manager
			
 
				   /// and update the analysis passes.
			
 
				   void updateAnalysis();
			
 
				 
			
 
				-  /// Vectorize Load and Store instructions,
			
 
				-  virtual void vectorizeMemoryInstruction(Instruction *Instr);
			
 
				-
			
 
				   /// Create a broadcast instruction. This method generates a broadcast
			
 
				   /// instruction (shuffle) for loop invariant values and for the induction
			
 
				   /// value. If this is the induction variable then we extend it to N, N+1, ...
			
@@ -647,10 +654,6 @@ protected:
 
				   /// vector of instructions.
			
 
				   void addMetadata(ArrayRef<Value *> To, Instruction *From);
			
 
				 
			
 
				-  /// \brief Set the debug location in the builder using the debug location in
			
 
				-  /// the instruction.
			
 
				-  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
			
 
				-
			
 
				   /// The original loop.
			
 
				   Loop *OrigLoop;
			
 
				 
			
@@ -2295,6 +2298,11 @@ private:
 
				   /// to \p Range.End.
			
 
				   VPInterleaveRecipe *tryToInterleaveMemory(Instruction *I, VFRange &Range);
			
 
				 
			
 
				+  /// Check if \p I is a memory instruction to be widened for \p Range.Start
			
 
				+  /// and potentially masked. If so build and return it; if not, return null.
			
 
				+  VPWidenMemoryInstructionRecipe *tryToWidenMemory(Instruction *I,
			
 
				+                                                   VFRange &Range);
			
 
				+
			
 
				   /// Check if an induction recipe should be constructed for \I within the given
			
 
				   /// VF \p Range. If so build and return it. If not, return null. \p Range.End
			
 
				   /// may be decreased to ensure same decision from \p Range.Start to
			
@@ -2302,6 +2310,11 @@ private:
 
				   VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
			
 
				                                                         VFRange &Range);
			
 
				 
			
 
				+  /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
			
 
				+  /// a sequence of select instructions as the vectorizer currently performs
			
 
				+  /// full if-conversion.
			
 
				+  VPBlendRecipe *tryToBlend(Instruction *I);
			
 
				+
			
 
				   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
			
 
				   /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
			
 
				   /// extended to include \p I or else build a new VPWidenRecipe for it and
			
@@ -4497,77 +4510,6 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
 
				   } while (Changed);
			
 
				 }
			
 
				 
			
 
				-InnerLoopVectorizer::VectorParts
			
 
				-InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
			
 
				-  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
			
 
				-
			
 
				-  // Look for cached value.
			
 
				-  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
			
 
				-  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
			
 
				-  if (ECEntryIt != EdgeMaskCache.end())
			
 
				-    return ECEntryIt->second;
			
 
				-
			
 
				-  VectorParts SrcMask = createBlockInMask(Src);
			
 
				-
			
 
				-  // The terminator has to be a branch inst!
			
 
				-  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
			
 
				-  assert(BI && "Unexpected terminator found");
			
 
				-
			
 
				-  if (!BI->isConditional())
			
 
				-    return EdgeMaskCache[Edge] = SrcMask;
			
 
				-
			
 
				-  VectorParts EdgeMask(UF);
			
 
				-  for (unsigned Part = 0; Part < UF; ++Part) {
			
 
				-    auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
			
 
				-    if (BI->getSuccessor(0) != Dst)
			
 
				-      EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
			
 
				-
			
 
				-    if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
			
 
				-      EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
			
 
				-
			
 
				-    EdgeMask[Part] = EdgeMaskPart;
			
 
				-  }
			
 
				-
			
 
				-  return EdgeMaskCache[Edge] = EdgeMask;
			
 
				-}
			
 
				-
			
 
				-InnerLoopVectorizer::VectorParts
			
 
				-InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
			
 
				-  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
			
 
				-
			
 
				-  // Look for cached value.
			
 
				-  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
			
 
				-  if (BCEntryIt != BlockMaskCache.end())
			
 
				-    return BCEntryIt->second;
			
 
				-
			
 
				-  // All-one mask is modelled as no-mask following the convention for masked
			
 
				-  // load/store/gather/scatter. Initialize BlockMask to no-mask.
			
 
				-  VectorParts BlockMask(UF);
			
 
				-  for (unsigned Part = 0; Part < UF; ++Part)
			
 
				-    BlockMask[Part] = nullptr;
			
 
				-
			
 
				-  // Loop incoming mask is all-one.
			
 
				-  if (OrigLoop->getHeader() == BB)
			
 
				-    return BlockMaskCache[BB] = BlockMask;
			
 
				-
			
 
				-  // This is the block mask. We OR all incoming edges.
			
 
				-  for (auto *Predecessor : predecessors(BB)) {
			
 
				-    VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
			
 
				-    if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
			
 
				-      return BlockMaskCache[BB] = EdgeMask;
			
 
				-
			
 
				-    if (!BlockMask[0]) { // BlockMask has its initialized nullptr value.
			
 
				-      BlockMask = EdgeMask;
			
 
				-      continue;
			
 
				-    }
			
 
				-
			
 
				-    for (unsigned Part = 0; Part < UF; ++Part)
			
 
				-      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
			
 
				-  }
			
 
				-
			
 
				-  return BlockMaskCache[BB] = BlockMask;
			
 
				-}
			
 
				-
			
 
				 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
			
 
				                                               unsigned VF) {
			
 
				   PHINode *P = cast<PHINode>(PN);
			
@@ -4588,43 +4530,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
 
				   }
			
 
				 
			
 
				   setDebugLocFromInst(Builder, P);
			
 
				-  // Check for PHI nodes that are lowered to vector selects.
			
 
				-  if (P->getParent() != OrigLoop->getHeader()) {
			
 
				-    // We know that all PHIs in non-header blocks are converted into
			
 
				-    // selects, so we don't have to worry about the insertion order and we
			
 
				-    // can just use the builder.
			
 
				-    // At this point we generate the predication tree. There may be
			
 
				-    // duplications since this is a simple recursive scan, but future
			
 
				-    // optimizations will clean it up.
			
 
				-
			
 
				-    unsigned NumIncoming = P->getNumIncomingValues();
			
 
				-
			
 
				-    // Generate a sequence of selects of the form:
			
 
				-    // SELECT(Mask3, In3,
			
 
				-    //      SELECT(Mask2, In2,
			
 
				-    //                   ( ...)))
			
 
				-    VectorParts Entry(UF);
			
 
				-    for (unsigned In = 0; In < NumIncoming; In++) {
			
 
				-      VectorParts Cond =
			
 
				-          createEdgeMask(P->getIncomingBlock(In), P->getParent());
			
 
				-
			
 
				-      for (unsigned Part = 0; Part < UF; ++Part) {
			
 
				-        Value *In0 = getOrCreateVectorValue(P->getIncomingValue(In), Part);
			
 
				-        assert((Cond[Part] || NumIncoming == 1) &&
			
 
				-               "Multiple predecessors with one predecessor having a full mask");
			
 
				-        if (In == 0)
			
 
				-          Entry[Part] = In0; // Initialize with the first incoming value.
			
 
				-        else
			
 
				-          // Select between the current value and the previous incoming edge
			
 
				-          // based on the incoming mask.
			
 
				-          Entry[Part] = Builder.CreateSelect(Cond[Part], In0, Entry[Part],
			
 
				-                                             "predphi");
			
 
				-      }
			
 
				-    }
			
 
				-    for (unsigned Part = 0; Part < UF; ++Part)
			
 
				-      VectorLoopValueMap.setVectorValue(P, Part, Entry[Part]);
			
 
				-    return;
			
 
				-  }
			
 
				 
			
 
				   // This PHINode must be an induction variable.
			
 
				   // Make sure that we know about it.
			
@@ -4848,10 +4753,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
				     break;
			
 
				   }
			
 
				 
			
 
				-  case Instruction::Store:
			
 
				-  case Instruction::Load:
			
 
				-    vectorizeMemoryInstruction(&I);
			
 
				-    break;
			
 
				   case Instruction::ZExt:
			
 
				   case Instruction::SExt:
			
 
				   case Instruction::FPToUI:
			
@@ -4956,7 +4857,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
				   }
			
 
				 
			
 
				   default:
			
 
				-    // All other instructions are scalarized.
			
 
				+    // This instruction is not vectorized by simple widening.
			
 
				     DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
			
 
				     llvm_unreachable("Unhandled instruction!");
			
 
				   } // end of switch.
			
@@ -7827,6 +7728,82 @@ public:
 
				   }
			
 
				 };
			
 
				 
			
 
				+/// A recipe for vectorizing a non-header phi-node as a sequence of mask-based
			
 
				+/// select instructions: one select per incoming edge after the first one.
			
 
				+class VPBlendRecipe : public VPRecipeBase {
			
 
				+private:
			
 
				+  PHINode *Phi; // The phi whose incoming values are blended by execute().
			
 
				+
			
 
				+public:
			
 
				+  VPBlendRecipe(PHINode *Phi) : VPRecipeBase(VPBlendSC), Phi(Phi) {}
			
 
				+
			
 
				+  /// Method to support type inquiry through isa, cast, and dyn_cast.
			
 
				+  static inline bool classof(const VPRecipeBase *V) {
			
 
				+    return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
			
 
				+  }
			
 
				+
			
 
				+  /// Generate the selects for every unrolled part and map them to the phi.
			
 
				+  void execute(VPTransformState &State) override {
			
 
				+    State.ILV->setDebugLocFromInst(State.Builder, Phi);
			
 
				+    // We know that all PHIs in non-header blocks are converted into
			
 
				+    // selects, so we don't have to worry about the insertion order and we
			
 
				+    // can just use the builder.
			
 
				+    // At this point we generate the predication tree. There may be
			
 
				+    // duplications since this is a simple recursive scan, but future
			
 
				+    // optimizations will clean it up.
			
 
				+
			
 
				+    unsigned NumIncoming = Phi->getNumIncomingValues();
			
 
				+
			
 
				+    // Generate a sequence of selects of the form:
			
 
				+    // SELECT(Mask3, In3,
			
 
				+    //      SELECT(Mask2, In2,
			
 
				+    //                   ( ...)))
			
 
				+    InnerLoopVectorizer::VectorParts Entry(State.UF);
			
 
				+    for (unsigned In = 0; In < NumIncoming; In++) {
			
 
				+      InnerLoopVectorizer::VectorParts Cond =
			
 
				+        State.ILV->createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent());
			
 
				+
			
 
				+      for (unsigned Part = 0; Part < State.UF; ++Part) {
			
 
				+        Value *In0 =
			
 
				+          State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
			
 
				+        assert((Cond[Part] || NumIncoming == 1) &&
			
 
				+               "Multiple predecessors with one predecessor having a full mask");
			
 
				+        if (In == 0)
			
 
				+          Entry[Part] = In0; // Initialize with the first incoming value.
			
 
				+        else
			
 
				+          // Select between the current value and the previous incoming edge
			
 
				+          // based on the incoming mask.
			
 
				+          Entry[Part] = State.Builder.CreateSelect(Cond[Part], In0, Entry[Part],
			
 
				+                                                   "predphi");
			
 
				+      }
			
 
				+    }
			
 
				+    for (unsigned Part = 0; Part < State.UF; ++Part) // Publish the result.
			
 
				+      State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
			
 
				+  }
			
 
				+
			
 
				+  /// Print the recipe as "BLEND <phi> = <value>/<block> ..."; a phi with a
			
 
				+  void print(raw_ostream &O, const Twine &Indent) const override {
			
 
				+    O << " +\n" << Indent << "\"BLEND "; // ... single incoming value prints only that value.
			
 
				+    Phi->printAsOperand(O, false);
			
 
				+    O << " =";
			
 
				+    if (Phi->getNumIncomingValues() == 1) {
			
 
				+      // Not a User of any mask: not really blending, this is a
			
 
				+      // single-predecessor phi.
			
 
				+      O << " ";
			
 
				+      Phi->getIncomingValue(0)->printAsOperand(O, false);
			
 
				+    } else {
			
 
				+      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I < E; ++I) {
			
 
				+        O << " ";
			
 
				+        Phi->getIncomingValue(I)->printAsOperand(O, false);
			
 
				+        O << "/";
			
 
				+        Phi->getIncomingBlock(I)->printAsOperand(O, false);
			
 
				+      }
			
 
				+    }
			
 
				+    O << "\\l\"";
			
 
				+
			
 
				+  }
			
 
				+};
			
 
				+
			
 
				 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
			
 
				 /// or stores into one wide load/store and shuffles.
			
 
				 class VPInterleaveRecipe : public VPRecipeBase {
			
@@ -7970,6 +7947,31 @@ public:
 
				   }
			
 
				 };
			
 
				 
			
 
				+/// A recipe for widening a single load or store instruction.
			
 
				+class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
			
 
				+private:
			
 
				+  Instruction &Instr; // The memory instruction handed to the vectorizer.
			
 
				+
			
 
				+public:
			
 
				+  VPWidenMemoryInstructionRecipe(Instruction &Instr)
			
 
				+      : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr) {}
			
 
				+
			
 
				+  /// Method to support type inquiry through isa, cast, and dyn_cast.
			
 
				+  static inline bool classof(const VPRecipeBase *V) {
			
 
				+    return V->getVPRecipeID() == VPRecipeBase::VPWidenMemoryInstructionSC;
			
 
				+  }
			
 
				+
			
 
				+  /// Generate the wide load/store by delegating to the vectorizer in \p State.
			
 
				+  void execute(VPTransformState &State) override {
			
 
				+    State.ILV->vectorizeMemoryInstruction(&Instr);
			
 
				+  }
			
 
				+
			
 
				+  /// Print the recipe as "WIDEN <instruction>".
			
 
				+  void print(raw_ostream &O, const Twine &Indent) const override {
			
 
				+    O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
			
 
				+    O << "\\l\"";
			
 
				+  }
			
 
				+};
			
 
				 } // end anonymous namespace
			
 
				 
			
 
				 bool LoopVectorizationPlanner::getDecisionAndClampRange(
			
@@ -7999,6 +8001,77 @@ void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
 
				   }
			
 
				 }
			
 
				 
			
 
				+InnerLoopVectorizer::VectorParts
			
 
				+InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { // Per-part mask of the Src->Dst edge; a null part means all-one.
			
 
				+  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
			
 
				+
			
 
				+  // Reuse the mask if this edge has already been computed.
			
 
				+  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
			
 
				+  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
			
 
				+  if (ECEntryIt != EdgeMaskCache.end())
			
 
				+    return ECEntryIt->second;
			
 
				+
			
 
				+  VectorParts SrcMask = createBlockInMask(Src);
			
 
				+
			
 
				+  // The terminator has to be a branch inst!
			
 
				+  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
			
 
				+  assert(BI && "Unexpected terminator found");
			
 
				+
			
 
				+  if (!BI->isConditional()) // The edge is always taken: reuse the source block's mask.
			
 
				+    return EdgeMaskCache[Edge] = SrcMask;
			
 
				+
			
 
				+  VectorParts EdgeMask(UF);
			
 
				+  for (unsigned Part = 0; Part < UF; ++Part) {
			
 
				+    auto *EdgeMaskPart = getOrCreateVectorValue(BI->getCondition(), Part);
			
 
				+    if (BI->getSuccessor(0) != Dst) // Dst is not the first successor: negate the condition.
			
 
				+      EdgeMaskPart = Builder.CreateNot(EdgeMaskPart);
			
 
				+
			
 
				+    if (SrcMask[Part]) // Otherwise block in-mask is all-one, no need to AND.
			
 
				+      EdgeMaskPart = Builder.CreateAnd(EdgeMaskPart, SrcMask[Part]);
			
 
				+
			
 
				+    EdgeMask[Part] = EdgeMaskPart;
			
 
				+  }
			
 
				+
			
 
				+  return EdgeMaskCache[Edge] = EdgeMask;
			
 
				+}
			
 
				+
			
 
				+InnerLoopVectorizer::VectorParts
			
 
				+InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { // Per-part entry mask of BB; a null part means all-one.
			
 
				+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
			
 
				+
			
 
				+  // Reuse the mask if this block has already been processed.
			
 
				+  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
			
 
				+  if (BCEntryIt != BlockMaskCache.end())
			
 
				+    return BCEntryIt->second;
			
 
				+
			
 
				+  // All-one mask is modelled as no-mask following the convention for masked
			
 
				+  // load/store/gather/scatter. Initialize BlockMask to no-mask.
			
 
				+  VectorParts BlockMask(UF);
			
 
				+  for (unsigned Part = 0; Part < UF; ++Part)
			
 
				+    BlockMask[Part] = nullptr;
			
 
				+
			
 
				+  // The mask of the loop header is all-one, i.e. no-mask.
			
 
				+  if (OrigLoop->getHeader() == BB)
			
 
				+    return BlockMaskCache[BB] = BlockMask;
			
 
				+
			
 
				+  // The block mask is the OR of the masks of all incoming edges.
			
 
				+  for (auto *Predecessor : predecessors(BB)) {
			
 
				+    VectorParts EdgeMask = createEdgeMask(Predecessor, BB);
			
 
				+    if (!EdgeMask[0]) // Mask of predecessor is all-one so mask of block is too.
			
 
				+      return BlockMaskCache[BB] = EdgeMask;
			
 
				+
			
 
				+    if (!BlockMask[0]) { // First edge seen: BlockMask still has its initial nullptr value.
			
 
				+      BlockMask = EdgeMask;
			
 
				+      continue;
			
 
				+    }
			
 
				+
			
 
				+    for (unsigned Part = 0; Part < UF; ++Part)
			
 
				+      BlockMask[Part] = Builder.CreateOr(BlockMask[Part], EdgeMask[Part]);
			
 
				+  }
			
 
				+
			
 
				+  return BlockMaskCache[BB] = BlockMask;
			
 
				+}
			
 
				+
			
 
				 VPInterleaveRecipe *
			
 
				 LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
			
 
				                                                 VFRange &Range) {
			
@@ -8026,6 +8099,32 @@ LoopVectorizationPlanner::tryToInterleaveMemory(Instruction *I,
 
				   return new VPInterleaveRecipe(IG);
			
 
				 }
			
 
				 
			
 
				+VPWidenMemoryInstructionRecipe *
			
 
				+LoopVectorizationPlanner::tryToWidenMemory(Instruction *I, VFRange &Range) { // Returns a new recipe, or nullptr if I is not widened here.
			
 
				+  if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) // Only loads and stores are candidates.
			
 
				+    return nullptr;
			
 
				+
			
 
				+  auto willWiden = [&](unsigned VF) -> bool { // Per-VF predicate: should I be widened?
			
 
				+    if (VF == 1) // Nothing is widened at VF 1.
			
 
				+      return false;
			
 
				+    if (CM.isScalarAfterVectorization(I, VF) ||
			
 
				+        CM.isProfitableToScalarize(I, VF))
			
 
				+      return false;
			
 
				+    LoopVectorizationCostModel::InstWidening Decision =
			
 
				+        CM.getWideningDecision(I, VF);
			
 
				+    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
			
 
				+           "CM decision should be taken at this point.");
			
 
				+    assert(Decision != LoopVectorizationCostModel::CM_Interleave &&
			
 
				+           "Interleave memory opportunity should be caught earlier.");
			
 
				+    return Decision != LoopVectorizationCostModel::CM_Scalarize; // Every other decision is treated as widening.
			
 
				+  };
			
 
				+
			
 
				+  if (!getDecisionAndClampRange(willWiden, Range)) // NOTE(review): presumably also clamps Range.End so the answer is uniform - confirm.
			
 
				+    return nullptr;
			
 
				+
			
 
				+  return new VPWidenMemoryInstructionRecipe(*I);
			
 
				+}
			
 
				+
			
 
				 VPWidenIntOrFpInductionRecipe *
			
 
				 LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
			
 
				                                                  VFRange &Range) {
			
@@ -8060,6 +8159,14 @@ LoopVectorizationPlanner::tryToOptimizeInduction(Instruction *I,
 
				   return nullptr;
			
 
				 }
			
 
				 
			
 
				+VPBlendRecipe *LoopVectorizationPlanner::tryToBlend(Instruction *I) { // Returns a new blend recipe, or nullptr if I does not qualify.
			
 
				+  PHINode *Phi = dyn_cast<PHINode>(I);
			
 
				+  if (!Phi || Phi->getParent() == OrigLoop->getHeader()) // Non-phis and loop-header phis are not blended.
			
 
				+    return nullptr;
			
 
				+
			
 
				+  return new VPBlendRecipe(Phi);
			
 
				+}
			
 
				+
			
 
				 bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
			
 
				                                           VFRange &Range) {
			
 
				   if (Legal->isScalarWithPredication(I))
			
@@ -8313,11 +8420,21 @@ std::unique_ptr<VPlan> LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 
				         continue;
			
 
				       }
			
 
				 
			
 
				+      // Check if Instr is a memory operation that should be widened.
			
 
				+      if ((Recipe = tryToWidenMemory(Instr, Range))) {
			
 
				+        VPBB->appendRecipe(Recipe);
			
 
				+        continue;
			
 
				+      }
			
 
				+
			
 
				       // Check if Instr should form some PHI recipe.
			
 
				       if ((Recipe = tryToOptimizeInduction(Instr, Range))) {
			
 
				         VPBB->appendRecipe(Recipe);
			
 
				         continue;
			
 
				       }
			
 
				+      if ((Recipe = tryToBlend(Instr))) {
			
 
				+        VPBB->appendRecipe(Recipe);
			
 
				+        continue;
			
 
				+      }
			
 
				       if (PHINode *Phi = dyn_cast<PHINode>(Instr)) {
			
 
				         VPBB->appendRecipe(new VPWidenPHIRecipe(Phi));
			
 
				         continue;
			
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -452,11 +452,13 @@ public:
 
				   /// SubclassID field of the VPRecipeBase objects. They are used for concrete
			
 
				   /// type identification.
			
 
				   using VPRecipeTy = enum {
			
 
				+    VPBlendSC, // ID passed to VPRecipeBase by VPBlendRecipe.
			
 
				     VPBranchOnMaskSC,
			
 
				     VPInterleaveSC,
			
 
				     VPPredInstPHISC,
			
 
				     VPReplicateSC,
			
 
				     VPWidenIntOrFpInductionSC,
			
 
				+    VPWidenMemoryInstructionSC, // ID passed to VPRecipeBase by VPWidenMemoryInstructionRecipe.
			
 
				     VPWidenPHISC,
			
 
				     VPWidenSC,
			
 
				   };
			
--- a/test/Transforms/LoopVectorize/X86/x86-predication.ll
+++ b/test/Transforms/LoopVectorize/X86/x86-predication.ll
@@ -1,4 +1,5 @@
 
				 ; RUN: opt < %s -mattr=avx -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -simplifycfg -S | FileCheck %s
			
 
				+; RUN: opt -mcpu=skylake-avx512 -S -force-vector-width=8 -force-vector-interleave=1 -loop-vectorize < %s | FileCheck %s --check-prefix=SINK-GATHER
			
 
				 
			
 
				 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
			
 
				 target triple = "x86_64-apple-macosx10.8.0"
			
@@ -58,3 +59,40 @@ for.end:
 
				   %tmp8 = phi i32 [ %tmp7, %for.inc ]
			
 
				   ret i32 %tmp8
			
 
				 }
			
 
				+
			
 
				+; This test ensures that a load which would otherwise have been widened is
			
 
				+; instead scalarized if the cost model so decides as part of its
			
 
				+; sink-scalar-operands optimization for predicated instructions.
			
 
				+;
			
 
				+; SINK-GATHER: vector.body:
			
 
				+; SINK-GATHER: pred.udiv.if:
			
 
				+; SINK-GATHER:   %[[T0:.+]] = load i32, i32* %{{.*}}, align 4
			
 
				+; SINK-GATHER:   %{{.*}} = udiv i32 %[[T0]], %{{.*}}
			
 
				+; SINK-GATHER: pred.udiv.continue:
			
 
				+define i32 @scalarize_and_sink_gather(i32* %a, i1 %c, i32 %x, i64 %n) {
			
 
				+entry:
			
 
				+  br label %for.body
			
 
				+
			
 
				+for.body:
			
 
				+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.inc ]
			
 
				+  %r = phi i32 [ 0, %entry ], [ %tmp6, %for.inc ]
			
 
				+  %i7 = mul i64 %i, 777 ; scaled index: the load below strides by 777 elements per iteration
			
 
				+  br i1 %c, label %if.then, label %for.inc ; %c is a function argument, so the condition is loop-invariant
			
 
				+
			
 
				+if.then:
			
 
				+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i7
			
 
				+  %tmp2 = load i32, i32* %tmp0, align 4 ; the load expected to be scalarized and sunk next to the udiv
			
 
				+  %tmp4 = udiv i32 %tmp2, %x ; conditionally executed division that consumes the load
			
 
				+  br label %for.inc
			
 
				+
			
 
				+for.inc:
			
 
				+  %tmp5 = phi i32 [ %x, %for.body ], [ %tmp4, %if.then]
			
 
				+  %tmp6 = add i32 %r, %tmp5
			
 
				+  %i.next = add nuw nsw i64 %i, 1
			
 
				+  %cond = icmp slt i64 %i.next, %n
			
 
				+  br i1 %cond, label %for.body, label %for.end
			
 
				+
			
 
				+for.end:
			
 
				+  %tmp7 = phi i32 [ %tmp6, %for.inc ]
			
 
				+  ret i32 %tmp7
			
 
				+}