@@ -70,17 +70,16 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB0_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB0_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB0_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB0_8: # %else8
@@ -111,17 +110,16 @@ define <4 x float> @gather_v4f32_ptr_v4i32(<4 x float*> %ptr, <4 x i32> %trigger
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB0_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB0_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB0_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB0_8: # %else8
@@ -227,17 +225,16 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB1_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB1_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB1_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB1_8: # %else8
@@ -273,17 +270,16 @@ define <4 x float> @gather_v4f32_v4i32_v4i32(float* %base, <4 x i32> %idx, <4 x
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB1_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB1_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB1_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB1_8: # %else8
@@ -388,17 +384,16 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB2_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vmovq %xmm3, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX1-NEXT: .LBB2_6: # %else5
 ; AVX1-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB2_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX1-NEXT: .LBB2_8: # %else8
@@ -433,17 +428,16 @@ define <4 x float> @gather_v4f32_v4i64_v4i32(float* %base, <4 x i64> %idx, <4 x
 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT: vpextrb $8, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB2_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
 ; AVX2-NEXT: .LBB2_6: # %else5
 ; AVX2-NEXT: vpextrb $12, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB2_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
 ; AVX2-NEXT: .LBB2_8: # %else8
@@ -662,15 +656,15 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vpinsrb $1, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_4: # %else2
 ; AVX1-NEXT: vpmovsxdq %xmm7, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm8
+; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
 ; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT: vpextrb $2, %xmm7, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_6
 ; AVX1-NEXT: # %bb.5: # %cond.load4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_6: # %else5
 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@@ -678,11 +672,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_8
 ; AVX1-NEXT: # %bb.7: # %cond.load7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_8: # %else8
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT: vpextrb $4, %xmm5, %eax
@@ -702,15 +695,15 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vpinsrb $5, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_12: # %else14
 ; AVX1-NEXT: vpmovsxdq %xmm6, %xmm6
-; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm8
+; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm5
 ; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
 ; AVX1-NEXT: vpcmpeqb %xmm7, %xmm2, %xmm7
 ; AVX1-NEXT: vpextrb $6, %xmm7, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_14
 ; AVX1-NEXT: # %bb.13: # %cond.load16
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vmovq %xmm5, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_14: # %else17
 ; AVX1-NEXT: vpaddq %xmm6, %xmm4, %xmm6
@@ -718,12 +711,11 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_16
 ; AVX1-NEXT: # %bb.15: # %cond.load19
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_16: # %else20
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm0
 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
 ; AVX1-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT: vpextrb $8, %xmm5, %eax
@@ -748,10 +740,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
 ; AVX1-NEXT: vpextrb $10, %xmm6, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_22
 ; AVX1-NEXT: # %bb.21: # %cond.load28
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vmovq %xmm7, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_22: # %else29
 ; AVX1-NEXT: vpaddq %xmm5, %xmm4, %xmm4
@@ -759,7 +751,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_24
 ; AVX1-NEXT: # %bb.23: # %cond.load31
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_24: # %else32
@@ -784,17 +775,16 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT: vpextrb $14, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: je .LBB3_30
 ; AVX1-NEXT: # %bb.29: # %cond.load40
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovq %xmm2, %rax
+; AVX1-NEXT: vmovq %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_30: # %else41
 ; AVX1-NEXT: vpextrb $15, %xmm1, %eax
 ; AVX1-NEXT: testb $1, %al
 ; AVX1-NEXT: je .LBB3_32
 ; AVX1-NEXT: # %bb.31: # %cond.load43
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX1-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
 ; AVX1-NEXT: .LBB3_32: # %else44
@@ -829,10 +819,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: vpcmpeqb %xmm6, %xmm2, %xmm6
 ; AVX2-NEXT: vpextrb $2, %xmm6, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
 ; AVX2-NEXT: je .LBB3_6
 ; AVX2-NEXT: # %bb.5: # %cond.load4
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm7
-; AVX2-NEXT: vmovq %xmm7, %rax
+; AVX2-NEXT: vmovq %xmm5, %rax
 ; AVX2-NEXT: vpinsrb $2, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_6: # %else5
 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
@@ -840,7 +830,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_8
 ; AVX2-NEXT: # %bb.7: # %cond.load7
-; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm5
 ; AVX2-NEXT: vpextrq $1, %xmm5, %rax
 ; AVX2-NEXT: vpinsrb $3, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_8: # %else8
@@ -865,10 +854,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT: vpextrb $6, %xmm5, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_14
 ; AVX2-NEXT: # %bb.13: # %cond.load16
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovq %xmm6, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $6, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_14: # %else17
 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm6
@@ -876,7 +865,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_16
 ; AVX2-NEXT: # %bb.15: # %cond.load19
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $7, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_16: # %else20
@@ -902,10 +890,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: vpcmpeqb %xmm5, %xmm2, %xmm5
 ; AVX2-NEXT: vpextrb $10, %xmm5, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_22
 ; AVX2-NEXT: # %bb.21: # %cond.load28
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm6
-; AVX2-NEXT: vmovq %xmm6, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $10, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_22: # %else29
 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1
@@ -913,7 +901,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_24
 ; AVX2-NEXT: # %bb.23: # %cond.load31
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $11, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_24: # %else32
@@ -938,17 +925,16 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT: vpextrb $14, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: je .LBB3_30
 ; AVX2-NEXT: # %bb.29: # %cond.load40
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $14, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_30: # %else41
 ; AVX2-NEXT: vpextrb $15, %xmm1, %eax
 ; AVX2-NEXT: testb $1, %al
 ; AVX2-NEXT: je .LBB3_32
 ; AVX2-NEXT: # %bb.31: # %cond.load43
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
 ; AVX2-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX2-NEXT: vpinsrb $15, (%rax), %xmm3, %xmm3
 ; AVX2-NEXT: .LBB3_32: # %else44
@@ -1009,9 +995,9 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: kshiftrw $4, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: je .LBB3_10
 ; AVX512-NEXT: # %bb.9: # %cond.load10
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: vmovq %xmm5, %rax
 ; AVX512-NEXT: vpinsrb $4, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_10: # %else11
@@ -1020,7 +1006,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_12
 ; AVX512-NEXT: # %bb.11: # %cond.load13
-; AVX512-NEXT: vextracti32x4 $2, %zmm4, %xmm5
 ; AVX512-NEXT: vpextrq $1, %xmm5, %rax
 ; AVX512-NEXT: vpinsrb $5, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_12: # %else14
@@ -1032,10 +1017,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: kshiftrw $6, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
 ; AVX512-NEXT: je .LBB3_14
 ; AVX512-NEXT: # %bb.13: # %cond.load16
-; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm5
-; AVX512-NEXT: vmovq %xmm5, %rax
+; AVX512-NEXT: vmovq %xmm4, %rax
 ; AVX512-NEXT: vpinsrb $6, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_14: # %else17
 ; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
@@ -1044,7 +1029,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_16
 ; AVX512-NEXT: # %bb.15: # %cond.load19
-; AVX512-NEXT: vextracti32x4 $3, %zmm4, %xmm4
 ; AVX512-NEXT: vpextrq $1, %xmm4, %rax
 ; AVX512-NEXT: vpinsrb $7, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_16: # %else20
@@ -1098,9 +1082,9 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: kshiftrw $12, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: je .LBB3_26
 ; AVX512-NEXT: # %bb.25: # %cond.load34
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: vmovq %xmm3, %rax
 ; AVX512-NEXT: vpinsrb $12, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_26: # %else35
@@ -1109,7 +1093,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_28
 ; AVX512-NEXT: # %bb.27: # %cond.load37
-; AVX512-NEXT: vextracti32x4 $2, %zmm0, %xmm3
 ; AVX512-NEXT: vpextrq $1, %xmm3, %rax
 ; AVX512-NEXT: vpinsrb $13, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_28: # %else38
@@ -1120,10 +1103,10 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: kshiftrw $14, %k0, %k1
 ; AVX512-NEXT: kmovw %k1, %eax
 ; AVX512-NEXT: testb $1, %al
+; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT: je .LBB3_30
 ; AVX512-NEXT: # %bb.29: # %cond.load40
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: vpinsrb $14, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_30: # %else41
 ; AVX512-NEXT: kshiftrw $15, %k0, %k0
@@ -1131,7 +1114,6 @@ define <16 x i8> @gather_v16i8_v16i32_v16i8(i8* %base, <16 x i32> %idx, <16 x i8
 ; AVX512-NEXT: testb $1, %al
 ; AVX512-NEXT: je .LBB3_32
 ; AVX512-NEXT: # %bb.31: # %cond.load43
-; AVX512-NEXT: vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT: vpextrq $1, %xmm0, %rax
 ; AVX512-NEXT: vpinsrb $15, (%rax), %xmm2, %xmm2
 ; AVX512-NEXT: .LBB3_32: # %else44