|
@@ -14,15 +14,14 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
|
|
|
; SI: ; %bb.0:
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
|
|
|
-; SI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
-; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
-; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; SI-NEXT: s_mov_b32 s4, 0x40a00000
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
+; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -30,15 +29,14 @@ define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out,
|
|
|
; VI: ; %bb.0:
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
|
|
|
-; VI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
-; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
-; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; VI-NEXT: s_mov_b32 s4, 0x40a00000
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
+; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
|
|
@@ -51,15 +49,14 @@ define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out,
|
|
|
; SI: ; %bb.0:
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
|
|
|
-; SI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; SI-NEXT: s_mov_b32 s5, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
-; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; SI-NEXT: v_mov_b32_e32 v1, s8
|
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -67,15 +64,14 @@ define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out,
|
|
|
; VI: ; %bb.0:
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
|
|
|
-; VI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; VI-NEXT: s_mov_b32 s5, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
-; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; VI-NEXT: v_mov_b32_e32 v1, s8
|
|
|
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
|
|
@@ -88,15 +84,14 @@ define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out,
|
|
|
; SI: ; %bb.0:
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
|
|
|
-; SI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; SI-NEXT: s_mov_b32 s6, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
-; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
-; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
+; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; SI-NEXT: v_mov_b32_e32 v2, s8
|
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -104,15 +99,14 @@ define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out,
|
|
|
; VI: ; %bb.0:
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
|
|
|
-; VI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; VI-NEXT: s_mov_b32 s6, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
-; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
-; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
+; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
-; VI-NEXT: v_mov_b32_e32 v2, s8
|
|
|
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
|
|
@@ -125,15 +119,14 @@ define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out,
|
|
|
; SI: ; %bb.0:
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x4
|
|
|
-; SI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; SI-NEXT: s_mov_b32 s7, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
-; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
-; SI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
-; SI-NEXT: v_mov_b32_e32 v3, s8
|
|
|
+; SI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -141,15 +134,14 @@ define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out,
|
|
|
; VI: ; %bb.0:
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
|
|
|
-; VI-NEXT: s_mov_b32 s8, 0x40a00000
|
|
|
+; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
+; VI-NEXT: s_mov_b32 s7, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
-; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
-; VI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
-; VI-NEXT: v_mov_b32_e32 v3, s8
|
|
|
+; VI-NEXT: v_mov_b32_e32 v3, s7
|
|
|
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
|
|
@@ -201,8 +193,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out,
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0x40a00000
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -214,8 +206,8 @@ define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out,
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, 0x40a00000
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
|
|
@@ -232,8 +224,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out,
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; SI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -245,8 +237,8 @@ define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out,
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; VI-NEXT: v_mov_b32_e32 v0, s4
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s5
|
|
|
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
|
|
@@ -293,16 +285,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
|
|
|
; SI-NEXT: s_load_dword s4, s[4:5], 0x4
|
|
|
-; SI-NEXT: v_mov_b32_e32 v2, 0x40a00000
|
|
|
+; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s7
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s7
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s6
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; SI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
|
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -311,16 +303,16 @@ define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)*
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8
|
|
|
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
|
|
|
-; VI-NEXT: v_mov_b32_e32 v2, 0x40a00000
|
|
|
+; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s7
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s7
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s6
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; VI-NEXT: v_mov_b32_e32 v2, s6
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
|
|
|
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
|
|
@@ -334,19 +326,19 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
|
|
|
; SI-NEXT: s_load_dword s4, s[4:5], 0x8
|
|
|
-; SI-NEXT: v_mov_b32_e32 v3, 0x40a00000
|
|
|
+; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; SI-NEXT: v_mov_b32_e32 v2, s10
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s10
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
|
|
; SI-NEXT: v_mov_b32_e32 v1, s9
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; SI-NEXT: v_mov_b32_e32 v3, s8
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
|
|
|
; SI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -355,19 +347,19 @@ define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)*
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
|
|
|
; VI-NEXT: s_load_dword s4, s[4:5], 0x20
|
|
|
-; VI-NEXT: v_mov_b32_e32 v3, 0x40a00000
|
|
|
+; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; VI-NEXT: v_mov_b32_e32 v2, s10
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s10
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
|
|
; VI-NEXT: v_mov_b32_e32 v1, s9
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; VI-NEXT: v_mov_b32_e32 v3, s8
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
|
|
|
; VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
|
|
@@ -381,22 +373,22 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
|
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x4
|
|
|
; SI-NEXT: s_load_dword s4, s[4:5], 0x8
|
|
|
-; SI-NEXT: v_mov_b32_e32 v4, 0x40a00000
|
|
|
+; SI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; SI-NEXT: s_mov_b32 s3, 0x100f000
|
|
|
; SI-NEXT: s_mov_b32 s2, -1
|
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s11
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s11
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s10
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s10
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s9
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
|
|
+; SI-NEXT: v_mov_b32_e32 v1, s9
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
|
|
|
-; SI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; SI-NEXT: v_mov_b32_e32 v4, s8
|
|
|
; SI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
|
|
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
|
|
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; SI-NEXT: s_endpgm
|
|
|
;
|
|
@@ -405,22 +397,22 @@ define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)*
|
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
|
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x10
|
|
|
; VI-NEXT: s_load_dword s4, s[4:5], 0x20
|
|
|
-; VI-NEXT: v_mov_b32_e32 v4, 0x40a00000
|
|
|
+; VI-NEXT: v_mov_b32_e32 v0, 0x40a00000
|
|
|
; VI-NEXT: s_mov_b32 s3, 0x1100f000
|
|
|
; VI-NEXT: s_mov_b32 s2, -1
|
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s11
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s11
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 3
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v3, v4, v0, vcc
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s10
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s10
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 2
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s9
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc
|
|
|
+; VI-NEXT: v_mov_b32_e32 v1, s9
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 1
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v1, v4, v0, vcc
|
|
|
-; VI-NEXT: v_mov_b32_e32 v0, s8
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
|
|
|
+; VI-NEXT: v_mov_b32_e32 v4, s8
|
|
|
; VI-NEXT: v_cmp_ne_u32_e64 vcc, s4, 0
|
|
|
-; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
|
|
|
+; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
|
|
|
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
|
; VI-NEXT: s_endpgm
|
|
|
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
|