il y a 5 ans · 2acd87bbf2
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -39841,6 +39841,21 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
 
				     if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
			
 
				       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
			
 
				   }
			
 
				+
			
 
				+  // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is
			
 
				+  // split across two registers, we can use a packusdw+perm to clamp to 0-65535
			
 
				+  // and concatenate at the same time. Then we can use a final vpmovuswb to
			
 
				+  // clip to 0-255.
			
 
				+  if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
			
 
				+      InVT == MVT::v16i32 && VT == MVT::v16i8) {
			
 
				+    if (auto USatVal = detectSSatPattern(In, VT, true)) {
			
 
				+      // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB.
			
 
				+      SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal,
			
 
				+                                           DL, DAG, Subtarget);
			
 
				+      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid);
			
 
				+    }
			
 
				+  }
			
 
				+
			
 
				   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
			
 
				       !(Subtarget.hasAVX512() && InSVT == MVT::i32) &&
			
 
				       !(Subtarget.hasBWI() && InSVT == MVT::i16) &&
			
--- a/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -1083,12 +1083,10 @@ define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>
 
				 define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" {
			
 
				 ; CHECK-LABEL: trunc_packus_v16i32_v16i8:
			
 
				 ; CHECK:       # %bb.0:
			
 
				-; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
			
 
				-; CHECK-NEXT:    vpmaxsd 32(%rdi), %ymm0, %ymm1
			
 
				-; CHECK-NEXT:    vpmovusdb %ymm1, %xmm1
			
 
				-; CHECK-NEXT:    vpmaxsd (%rdi), %ymm0, %ymm0
			
 
				-; CHECK-NEXT:    vpmovusdb %ymm0, %xmm0
			
 
				-; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
			
 
				+; CHECK-NEXT:    vmovdqa (%rdi), %ymm0
			
 
				+; CHECK-NEXT:    vpackusdw 32(%rdi), %ymm0, %ymm0
			
 
				+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
			
 
				+; CHECK-NEXT:    vpmovuswb %ymm0, %xmm0
			
 
				 ; CHECK-NEXT:    vzeroupper
			
 
				 ; CHECK-NEXT:    retq
			
 
				   %a = load <16 x i32>, <16 x i32>* %p