瀏覽代碼

[LV] fold-tail predication should be respected even with assume_safety

assume_safety implies that loads under "if's" can be safely executed
speculatively (unguarded, unmasked). However this assumption holds only for the
original user "if's", not those introduced by the compiler, such as the
fold-tail "if" that guards us from loading beyond the original loop trip-count.
Currently the combination of fold-tail and assume-safety pragmas results in
ignoring the fold-tail predicate that guards the loads, generating unmasked
loads. This patch fixes this behavior.

Differential Revision: https://reviews.llvm.org/D66106

Reviewers: Ayal, hsaito, fhahn



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368973 91177308-0d34-0410-b5e6-96231b3b80d8
Dorit Nuzman 6 年之前
父節點
當前提交
ecfc353229

+ 12 - 5
include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

@@ -228,8 +228,8 @@ public:
   bool canVectorize(bool UseVPlanNativePath);
   bool canVectorize(bool UseVPlanNativePath);
 
 
   /// Return true if we can vectorize this loop while folding its tail by
   /// Return true if we can vectorize this loop while folding its tail by
-  /// masking.
-  bool canFoldTailByMasking();
+  /// masking, and mark all respective loads/stores for masking.
+  bool prepareToFoldTailByMasking();
 
 
   /// Returns the primary induction variable.
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
@@ -355,9 +355,16 @@ private:
   bool canVectorizeOuterLoop();
   bool canVectorizeOuterLoop();
 
 
   /// Return true if all of the instructions in the block can be speculatively
   /// Return true if all of the instructions in the block can be speculatively
-  /// executed. \p SafePtrs is a list of addresses that are known to be legal
-  /// and we know that we can read from them without segfault.
-  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs);
+  /// executed, and record the loads/stores that require masking. If's that
+  /// guard loads can be ignored under "assume safety" unless \p PreserveGuards
+  /// is true. This can happen when we introduce guards for which the original
+  /// "unguarded-loads are safe" assumption does not hold. For example, the
+  /// vectorizer's fold-tail transformation changes the loop to execute beyond
+  /// its original trip-count, under a proper guard, which should be preserved.
+  /// \p SafePtrs is a list of addresses that are known to be legal and we know
+  /// that we can read from them without segfault.
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+                            bool PreserveGuards = false);
 
 
   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// This can set \p Phi as the main induction of the loop if \p Phi is a
   /// This can set \p Phi as the main induction of the loop if \p Phi is a

+ 4 - 4
lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

@@ -869,7 +869,7 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
 }
 }
 
 
 bool LoopVectorizationLegality::blockCanBePredicated(
 bool LoopVectorizationLegality::blockCanBePredicated(
-    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs) {
+    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
 
 
   for (Instruction &I : *BB) {
   for (Instruction &I : *BB) {
@@ -888,7 +888,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(
         // !llvm.mem.parallel_loop_access implies if-conversion safety.
         // !llvm.mem.parallel_loop_access implies if-conversion safety.
         // Otherwise, record that the load needs (real or emulated) masking
         // Otherwise, record that the load needs (real or emulated) masking
         // and let the cost model decide.
         // and let the cost model decide.
-        if (!IsAnnotatedParallel)
+        if (!IsAnnotatedParallel || PreserveGuards)
           MaskedOp.insert(LI);
           MaskedOp.insert(LI);
         continue;
         continue;
       }
       }
@@ -1159,7 +1159,7 @@ bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
   return Result;
   return Result;
 }
 }
 
 
-bool LoopVectorizationLegality::canFoldTailByMasking() {
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
 
 
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
 
@@ -1202,7 +1202,7 @@ bool LoopVectorizationLegality::canFoldTailByMasking() {
   // Check and mark all blocks for predication, including those that ordinarily
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers)) {
+    if (!blockCanBePredicated(BB, SafePointers, /* PreserveGuards= */ true)) {
       reportVectorizationFailure(
       reportVectorizationFailure(
           "Cannot fold tail by masking as required",
           "Cannot fold tail by masking as required",
           "control flow cannot be substituted for a select",
           "control flow cannot be substituted for a select",

+ 1 - 1
lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -4853,7 +4853,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
   // found modulo the vectorization factor is not zero, try to fold the tail
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
   // by masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
-  if (Legal->canFoldTailByMasking()) {
+  if (Legal->prepareToFoldTailByMasking()) {
     FoldTailByMasking = true;
     FoldTailByMasking = true;
     return MaxVF;
     return MaxVF;
   }
   }

+ 166 - 0
test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll

@@ -0,0 +1,166 @@
+; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-linux-gnu"
+
+; Case1: With pragma predicate to force tail-folding.
+; All memory operations are masked.
+;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
+;   #pragma clang loop vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8
+}
+
+; Case2: With pragma assume_safety only the store is masked.
+; void assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @assume_safety
+;CHECK: vector.body:
+;CHECK-NOT: @llvm.masked.load
+;CHECK:  call void @llvm.masked.store
+
+; Function Attrs: norecurse nounwind uwtable
+define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 {
+  %5 = sext i32 %3 to i64
+  br label %7
+
+; <label>:6:
+  ret void
+
+; <label>:7:
+  %8 = phi i64 [ 0, %4 ], [ %18, %17 ]
+  %9 = icmp sgt i64 %8, %5
+  br i1 %9, label %10, label %17
+
+; <label>:10:
+  %11 = getelementptr inbounds i32, i32* %1, i64 %8
+  %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %13 = getelementptr inbounds i32, i32* %2, i64 %8
+  %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  %15 = add nsw i32 %14, %12
+  %16 = getelementptr inbounds i32, i32* %0, i64 %8
+  store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
+  br label %17
+
+; <label>:17:
+  %18 = add nuw nsw i64 %8, 1
+  %19 = icmp eq i64 %18, 1021
+  br i1 %19, label %6, label %7, !llvm.loop !6
+}
+
+; Case3: With pragma assume_safety and pragma predicate both the store and the
+; load are masked.
+; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
+;   #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
+;   for(int ix=0; ix < 1021; ++ix) {
+;     if (ix > guard) {
+;       p[ix] = q1[ix] + q2[ix];
+;     }
+;   }
+;}
+
+;CHECK-LABEL: @fold_tail_and_assume_safety
+;CHECK: vector.body:
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call <8 x i32> @llvm.masked.load
+;CHECK: call void @llvm.masked.store
+
+; Function Attrs: nofree norecurse nounwind uwtable
+define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2, 
+i32 %guard) local_unnamed_addr #0 {
+entry:
+  %0 = sext i32 %guard to i64
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
+  %cmp1 = icmp sgt i64 %indvars.iv, %0
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:
+  %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10
+  %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv
+  %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10
+  %add = add nsw i32 %2, %1
+  %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1021
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11
+}
+
+attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"}
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+!6 = distinct !{!6, !7}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+!8 = distinct !{!8, !9}
+!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+
+!10 = distinct !{}
+!11 = distinct !{!11, !12, !13}
+!12 = !{!"llvm.loop.parallel_accesses", !10}
+!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

+ 11 - 11
test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll

@@ -102,17 +102,17 @@ define void @vectorized1(float* noalias nocapture %A, float* noalias nocapture r
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
-; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP7]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP8]])
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x float> undef), !llvm.access.group !6
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd fast <8 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7
 ; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !7