|
@@ -147,6 +147,12 @@ static cl::opt<unsigned> MinTreeSize(
|
|
|
"slp-min-tree-size", cl::init(3), cl::Hidden,
|
|
|
cl::desc("Only vectorize small trees if they are fully vectorizable"));
|
|
|
|
|
|
+// The maximum depth that the look-ahead score heuristic will explore.
|
|
|
+// The higher this value, the higher the compilation time overhead.
|
|
|
+static cl::opt<int> LookAheadMaxDepth(
|
|
|
+ "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
|
|
|
+ cl::desc("The maximum look-ahead depth for operand reordering scores"));
|
|
|
+
|
|
|
static cl::opt<bool>
|
|
|
ViewSLPTree("view-slp-tree", cl::Hidden,
|
|
|
cl::desc("Display the SLP trees with Graphviz"));
|
|
@@ -708,6 +714,7 @@ public:
|
|
|
|
|
|
const DataLayout &DL;
|
|
|
ScalarEvolution &SE;
|
|
|
+ const BoUpSLP &R;
|
|
|
|
|
|
/// \returns the operand data at \p OpIdx and \p Lane.
|
|
|
OperandData &getData(unsigned OpIdx, unsigned Lane) {
|
|
@@ -733,6 +740,207 @@ public:
|
|
|
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
|
|
|
}
|
|
|
|
|
|
+ // The hard-coded scores listed here are not very important. When computing
|
|
|
+ // the scores of matching one sub-tree with another, we are basically
|
|
|
+ // counting the number of values that are matching. So even if all scores
|
|
|
+ // are set to 1, we would still get a decent matching result.
|
|
|
+ // However, sometimes we have to break ties. For example we may have to
|
|
|
+ // choose between matching loads vs matching opcodes. This is what these
|
|
|
+ // scores are helping us with: they provide the order of preference.
|
|
|
+
|
|
|
+ /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
|
|
|
+ static const int ScoreConsecutiveLoads = 3;
|
|
|
+ /// Constants.
|
|
|
+ static const int ScoreConstants = 2;
|
|
|
+ /// Instructions with the same opcode.
|
|
|
+ static const int ScoreSameOpcode = 2;
|
|
|
+ /// Instructions with alt opcodes (e.g, add + sub).
|
|
|
+ static const int ScoreAltOpcodes = 1;
|
|
|
+ /// Identical instructions (a.k.a. splat or broadcast).
|
|
|
+ static const int ScoreSplat = 1;
|
|
|
+ /// Matching with an undef is preferable to failing.
|
|
|
+ static const int ScoreUndef = 1;
|
|
|
+ /// Score for failing to find a decent match.
|
|
|
+ static const int ScoreFail = 0;
|
|
|
+ /// User external to the vectorized code.
|
|
|
+ static const int ExternalUseCost = 1;
|
|
|
+ /// The user is internal but in a different lane.
|
|
|
+ static const int UserInDiffLaneCost = ExternalUseCost;
|
|
|
+
|
|
|
+ /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
|
|
|
+ static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
|
|
|
+ ScalarEvolution &SE) {
|
|
|
+ auto *LI1 = dyn_cast<LoadInst>(V1);
|
|
|
+ auto *LI2 = dyn_cast<LoadInst>(V2);
|
|
|
+ if (LI1 && LI2)
|
|
|
+ return isConsecutiveAccess(LI1, LI2, DL, SE)
|
|
|
+ ? VLOperands::ScoreConsecutiveLoads
|
|
|
+ : VLOperands::ScoreFail;
|
|
|
+
|
|
|
+ auto *C1 = dyn_cast<Constant>(V1);
|
|
|
+ auto *C2 = dyn_cast<Constant>(V2);
|
|
|
+ if (C1 && C2)
|
|
|
+ return VLOperands::ScoreConstants;
|
|
|
+
|
|
|
+ auto *I1 = dyn_cast<Instruction>(V1);
|
|
|
+ auto *I2 = dyn_cast<Instruction>(V2);
|
|
|
+ if (I1 && I2) {
|
|
|
+ if (I1 == I2)
|
|
|
+ return VLOperands::ScoreSplat;
|
|
|
+ InstructionsState S = getSameOpcode({I1, I2});
|
|
|
+ // Note: Only consider instructions with <= 2 operands to avoid
|
|
|
+ // complexity explosion.
|
|
|
+ if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
|
|
|
+ return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
|
|
|
+ : VLOperands::ScoreSameOpcode;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (isa<UndefValue>(V2))
|
|
|
+ return VLOperands::ScoreUndef;
|
|
|
+
|
|
|
+ return VLOperands::ScoreFail;
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Holds the values and their lane that are taking part in the look-ahead
|
|
|
+ /// score calculation. This is used in the external uses cost calculation.
|
|
|
+ SmallDenseMap<Value *, int> InLookAheadValues;
|
|
|
+
|
|
|
+ /// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
|
|
|
+ /// either external to the vectorized code, or require shuffling.
|
|
|
+ int getExternalUsesCost(const std::pair<Value *, int> &LHS,
|
|
|
+ const std::pair<Value *, int> &RHS) {
|
|
|
+ int Cost = 0;
|
|
|
+ SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
|
|
|
+ for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
|
|
|
+ Value *V = Values[Idx].first;
|
|
|
+ // Calculate the absolute lane, using the minimum relative lane of LHS
|
|
|
+ // and RHS as base and Idx as the offset.
|
|
|
+ int Ln = std::min(LHS.second, RHS.second) + Idx;
|
|
|
+ assert(Ln >= 0 && "Bad lane calculation");
|
|
|
+ for (User *U : V->users()) {
|
|
|
+ if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
|
|
|
+ // The user is in the VectorizableTree. Check if we need to insert.
|
|
|
+ auto It = llvm::find(UserTE->Scalars, U);
|
|
|
+ assert(It != UserTE->Scalars.end() && "U is in UserTE");
|
|
|
+ int UserLn = std::distance(UserTE->Scalars.begin(), It);
|
|
|
+ assert(UserLn >= 0 && "Bad lane");
|
|
|
+ if (UserLn != Ln)
|
|
|
+ Cost += UserInDiffLaneCost;
|
|
|
+ } else {
|
|
|
+ // Check if the user is in the look-ahead code.
|
|
|
+ auto It2 = InLookAheadValues.find(U);
|
|
|
+ if (It2 != InLookAheadValues.end()) {
|
|
|
+ // The user is in the look-ahead code. Check the lane.
|
|
|
+ if (It2->second != Ln)
|
|
|
+ Cost += UserInDiffLaneCost;
|
|
|
+ } else {
|
|
|
+ // The user is neither in SLP tree nor in the look-ahead code.
|
|
|
+ Cost += ExternalUseCost;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return Cost;
|
|
|
+ }
|
|
|
+
|
|
|
+ /// Go through the operands of \p LHS and \p RHS recursively until \p
|
|
|
+ /// MaxLevel, and return the cummulative score. For example:
|
|
|
+ /// \verbatim
|
|
|
+ /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
|
|
|
+ /// \ / \ / \ / \ /
|
|
|
+ /// + + + +
|
|
|
+ /// G1 G2 G3 G4
|
|
|
+ /// \endverbatim
|
|
|
+ /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
|
|
|
+ /// each level recursively, accumulating the score. It starts from matching
|
|
|
+ /// the additions at level 0, then moves on to the loads (level 1). The
|
|
|
+ /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
|
|
|
+ /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
|
|
|
+ /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
|
|
|
+ /// Please note that the order of the operands does not matter, as we
|
|
|
+ /// evaluate the score of all profitable combinations of operands. In
|
|
|
+ /// other words the score of G1 and G4 is the same as G1 and G2. This
|
|
|
+ /// heuristic is based on ideas described in:
|
|
|
+ /// Look-ahead SLP: Auto-vectorization in the presence of commutative
|
|
|
+ /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
|
|
|
+ /// Luís F. W. Góes
|
|
|
+ int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
|
|
|
+ const std::pair<Value *, int> &RHS, int CurrLevel,
|
|
|
+ int MaxLevel) {
|
|
|
+
|
|
|
+ Value *V1 = LHS.first;
|
|
|
+ Value *V2 = RHS.first;
|
|
|
+ // Get the shallow score of V1 and V2.
|
|
|
+ int ShallowScoreAtThisLevel =
|
|
|
+ std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
|
|
|
+ getExternalUsesCost(LHS, RHS));
|
|
|
+ int Lane1 = LHS.second;
|
|
|
+ int Lane2 = RHS.second;
|
|
|
+
|
|
|
+ // If reached MaxLevel,
|
|
|
+ // or if V1 and V2 are not instructions,
|
|
|
+ // or if they are SPLAT,
|
|
|
+ // or if they are not consecutive, early return the current cost.
|
|
|
+ auto *I1 = dyn_cast<Instruction>(V1);
|
|
|
+ auto *I2 = dyn_cast<Instruction>(V2);
|
|
|
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
|
|
|
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
|
|
|
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
|
|
|
+ return ShallowScoreAtThisLevel;
|
|
|
+ assert(I1 && I2 && "Should have early exited.");
|
|
|
+
|
|
|
+ // Keep track of in-tree values for determining the external-use cost.
|
|
|
+ InLookAheadValues[V1] = Lane1;
|
|
|
+ InLookAheadValues[V2] = Lane2;
|
|
|
+
|
|
|
+ // Contains the I2 operand indexes that got matched with I1 operands.
|
|
|
+ SmallSet<int, 4> Op2Used;
|
|
|
+
|
|
|
+ // Recursion towards the operands of I1 and I2. We are trying all possbile
|
|
|
+ // operand pairs, and keeping track of the best score.
|
|
|
+ for (int OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
|
|
|
+ OpIdx1 != NumOperands1; ++OpIdx1) {
|
|
|
+ // Try to pair op1I with the best operand of I2.
|
|
|
+ int MaxTmpScore = 0;
|
|
|
+ int MaxOpIdx2 = -1;
|
|
|
+ // If I2 is commutative try all combinations.
|
|
|
+ int FromIdx = isCommutative(I2) ? 0 : OpIdx1;
|
|
|
+ int ToIdx = isCommutative(I2) ? I2->getNumOperands() : OpIdx1 + 1;
|
|
|
+ assert(FromIdx < ToIdx && "Bad index");
|
|
|
+ for (int OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
|
|
|
+ // Skip operands already paired with OpIdx1.
|
|
|
+ if (Op2Used.count(OpIdx2))
|
|
|
+ continue;
|
|
|
+ // Recursively calculate the cost at each level
|
|
|
+ int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
|
|
|
+ {I2->getOperand(OpIdx2), Lane2},
|
|
|
+ CurrLevel + 1, MaxLevel);
|
|
|
+ // Look for the best score.
|
|
|
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
|
|
|
+ MaxTmpScore = TmpScore;
|
|
|
+ MaxOpIdx2 = OpIdx2;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (MaxOpIdx2 >= 0) {
|
|
|
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
|
|
|
+ Op2Used.insert(MaxOpIdx2);
|
|
|
+ ShallowScoreAtThisLevel += MaxTmpScore;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return ShallowScoreAtThisLevel;
|
|
|
+ }
|
|
|
+
|
|
|
+ /// \Returns the look-ahead score, which tells us how much the sub-trees
|
|
|
+ /// rooted at \p LHS and \p RHS match, the more they match the higher the
|
|
|
+ /// score. This helps break ties in an informed way when we cannot decide on
|
|
|
+ /// the order of the operands by just considering the immediate
|
|
|
+ /// predecessors.
|
|
|
+ int getLookAheadScore(const std::pair<Value *, int> &LHS,
|
|
|
+ const std::pair<Value *, int> &RHS) {
|
|
|
+ InLookAheadValues.clear();
|
|
|
+ return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
|
|
|
+ }
|
|
|
+
|
|
|
// Search all operands in Ops[*][Lane] for the one that matches best
|
|
|
// Ops[OpIdx][LastLane] and return its opreand index.
|
|
|
// If no good match can be found, return None.
|
|
@@ -750,9 +958,6 @@ public:
|
|
|
// The linearized opcode of the operand at OpIdx, Lane.
|
|
|
bool OpIdxAPO = getData(OpIdx, Lane).APO;
|
|
|
|
|
|
- const unsigned BestScore = 2;
|
|
|
- const unsigned GoodScore = 1;
|
|
|
-
|
|
|
// The best operand index and its score.
|
|
|
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
|
|
|
// are using the score to differentiate between the two.
|
|
@@ -781,41 +986,19 @@ public:
|
|
|
// Look for an operand that matches the current mode.
|
|
|
switch (RMode) {
|
|
|
case ReorderingMode::Load:
|
|
|
- if (isa<LoadInst>(Op)) {
|
|
|
- // Figure out which is left and right, so that we can check for
|
|
|
- // consecutive loads
|
|
|
- bool LeftToRight = Lane > LastLane;
|
|
|
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
|
|
|
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
|
|
|
- if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
|
|
|
- cast<LoadInst>(OpRight), DL, SE))
|
|
|
- BestOp.Idx = Idx;
|
|
|
- }
|
|
|
- break;
|
|
|
- case ReorderingMode::Opcode:
|
|
|
- // We accept both Instructions and Undefs, but with different scores.
|
|
|
- if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
|
|
|
- cast<Instruction>(Op)->getOpcode() ==
|
|
|
- cast<Instruction>(OpLastLane)->getOpcode()) ||
|
|
|
- (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
|
|
|
- isa<UndefValue>(Op)) {
|
|
|
- // An instruction has a higher score than an undef.
|
|
|
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
|
|
|
- if (Score > BestOp.Score) {
|
|
|
- BestOp.Idx = Idx;
|
|
|
- BestOp.Score = Score;
|
|
|
- }
|
|
|
- }
|
|
|
- break;
|
|
|
case ReorderingMode::Constant:
|
|
|
- if (isa<Constant>(Op)) {
|
|
|
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
|
|
|
- if (Score > BestOp.Score) {
|
|
|
- BestOp.Idx = Idx;
|
|
|
- BestOp.Score = Score;
|
|
|
- }
|
|
|
+ case ReorderingMode::Opcode: {
|
|
|
+ bool LeftToRight = Lane > LastLane;
|
|
|
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
|
|
|
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
|
|
|
+ unsigned Score =
|
|
|
+ getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
|
|
|
+ if (Score > BestOp.Score) {
|
|
|
+ BestOp.Idx = Idx;
|
|
|
+ BestOp.Score = Score;
|
|
|
}
|
|
|
break;
|
|
|
+ }
|
|
|
case ReorderingMode::Splat:
|
|
|
if (Op == OpLastLane)
|
|
|
BestOp.Idx = Idx;
|
|
@@ -946,8 +1129,8 @@ public:
|
|
|
public:
|
|
|
/// Initialize with all the operands of the instruction vector \p RootVL.
|
|
|
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
|
|
|
- ScalarEvolution &SE)
|
|
|
- : DL(DL), SE(SE) {
|
|
|
+ ScalarEvolution &SE, const BoUpSLP &R)
|
|
|
+ : DL(DL), SE(SE), R(R) {
|
|
|
// Append all the operands of RootVL.
|
|
|
appendOperandsOfVL(RootVL);
|
|
|
}
|
|
@@ -1169,7 +1352,8 @@ private:
|
|
|
SmallVectorImpl<Value *> &Left,
|
|
|
SmallVectorImpl<Value *> &Right,
|
|
|
const DataLayout &DL,
|
|
|
- ScalarEvolution &SE);
|
|
|
+ ScalarEvolution &SE,
|
|
|
+ const BoUpSLP &R);
|
|
|
struct TreeEntry {
|
|
|
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
|
|
|
TreeEntry(VecTreeTy &Container) : Container(Container) {}
|
|
@@ -2371,7 +2555,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|
|
// Commutative predicate - collect + sort operands of the instructions
|
|
|
// so that each side is more likely to have the same opcode.
|
|
|
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
|
|
|
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
|
|
|
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
|
|
|
} else {
|
|
|
// Collect operands - commute if it uses the swapped predicate.
|
|
|
for (Value *V : VL) {
|
|
@@ -2415,7 +2599,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|
|
// have the same opcode.
|
|
|
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
|
|
|
ValueList Left, Right;
|
|
|
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
|
|
|
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
|
|
|
buildTree_rec(Left, Depth + 1, {TE, 0});
|
|
|
buildTree_rec(Right, Depth + 1, {TE, 1});
|
|
|
return;
|
|
@@ -2584,7 +2768,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
|
|
|
// Reorder operands if reordering would enable vectorization.
|
|
|
if (isa<BinaryOperator>(VL0)) {
|
|
|
ValueList Left, Right;
|
|
|
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
|
|
|
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
|
|
|
buildTree_rec(Left, Depth + 1, {TE, 0});
|
|
|
buildTree_rec(Right, Depth + 1, {TE, 1});
|
|
|
return;
|
|
@@ -3299,13 +3483,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
|
|
|
|
|
|
// Perform operand reordering on the instructions in VL and return the reordered
|
|
|
// operands in Left and Right.
|
|
|
-void BoUpSLP::reorderInputsAccordingToOpcode(
|
|
|
- ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
|
|
|
- SmallVectorImpl<Value *> &Right, const DataLayout &DL,
|
|
|
- ScalarEvolution &SE) {
|
|
|
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
|
|
|
+ SmallVectorImpl<Value *> &Left,
|
|
|
+ SmallVectorImpl<Value *> &Right,
|
|
|
+ const DataLayout &DL,
|
|
|
+ ScalarEvolution &SE,
|
|
|
+ const BoUpSLP &R) {
|
|
|
if (VL.empty())
|
|
|
return;
|
|
|
- VLOperands Ops(VL, DL, SE);
|
|
|
+ VLOperands Ops(VL, DL, SE, R);
|
|
|
// Reorder the operands in place.
|
|
|
Ops.reorder();
|
|
|
Left = Ops.getVL(0);
|