|
@@ -317,10 +317,12 @@ void HELPER(neon_sqdmulh_idx_h)(void *vd, void *vn, void *vm,
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
int idx = simd_data(desc);
|
|
int idx = simd_data(desc);
|
|
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 2;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 2, elements);
|
|
|
|
|
|
- for (i = 0; i < opr_sz / 2; i += 16 / 2) {
|
|
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 2) {
|
|
int16_t mm = m[i];
|
|
int16_t mm = m[i];
|
|
- for (j = 0; j < 16 / 2; ++j) {
|
|
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
|
|
d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, vq);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -333,16 +335,54 @@ void HELPER(neon_sqrdmulh_idx_h)(void *vd, void *vn, void *vm,
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
int idx = simd_data(desc);
|
|
int idx = simd_data(desc);
|
|
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 2;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 2, elements);
|
|
|
|
|
|
- for (i = 0; i < opr_sz / 2; i += 16 / 2) {
|
|
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 2) {
|
|
int16_t mm = m[i];
|
|
int16_t mm = m[i];
|
|
- for (j = 0; j < 16 / 2; ++j) {
|
|
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
|
|
d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, vq);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+void HELPER(neon_sqrdmlah_idx_h)(void *vd, void *vn, void *vm,
|
|
|
|
+ void *vq, uint32_t desc)
|
|
|
|
+{
|
|
|
|
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
|
|
+ int idx = simd_data(desc);
|
|
|
|
+ int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 2;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 2, elements);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 2) {
|
|
|
|
+ int16_t mm = m[i];
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
|
|
+ d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], false, true, vq);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+void HELPER(neon_sqrdmlsh_idx_h)(void *vd, void *vn, void *vm,
|
|
|
|
+ void *vq, uint32_t desc)
|
|
|
|
+{
|
|
|
|
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
|
|
+ int idx = simd_data(desc);
|
|
|
|
+ int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 2;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 2, elements);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 2) {
|
|
|
|
+ int16_t mm = m[i];
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
|
|
+ d[i + j] = do_sqrdmlah_h(n[i + j], mm, d[i + j], true, true, vq);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
|
|
+}
|
|
|
|
+
|
|
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
|
|
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
|
|
void *va, uint32_t desc)
|
|
void *va, uint32_t desc)
|
|
{
|
|
{
|
|
@@ -512,10 +552,12 @@ void HELPER(neon_sqdmulh_idx_s)(void *vd, void *vn, void *vm,
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
int idx = simd_data(desc);
|
|
int idx = simd_data(desc);
|
|
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 4;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 4, elements);
|
|
|
|
|
|
- for (i = 0; i < opr_sz / 4; i += 16 / 4) {
|
|
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 4) {
|
|
int32_t mm = m[i];
|
|
int32_t mm = m[i];
|
|
- for (j = 0; j < 16 / 4; ++j) {
|
|
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
|
|
d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, vq);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
@@ -528,16 +570,54 @@ void HELPER(neon_sqrdmulh_idx_s)(void *vd, void *vn, void *vm,
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
int idx = simd_data(desc);
|
|
int idx = simd_data(desc);
|
|
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 4;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 4, elements);
|
|
|
|
|
|
- for (i = 0; i < opr_sz / 4; i += 16 / 4) {
|
|
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 4) {
|
|
int32_t mm = m[i];
|
|
int32_t mm = m[i];
|
|
- for (j = 0; j < 16 / 4; ++j) {
|
|
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
|
|
d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, vq);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+void HELPER(neon_sqrdmlah_idx_s)(void *vd, void *vn, void *vm,
|
|
|
|
+ void *vq, uint32_t desc)
|
|
|
|
+{
|
|
|
|
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
|
|
+ int idx = simd_data(desc);
|
|
|
|
+ int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 4;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 4, elements);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 4) {
|
|
|
|
+ int32_t mm = m[i];
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
|
|
+ d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], false, true, vq);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+void HELPER(neon_sqrdmlsh_idx_s)(void *vd, void *vn, void *vm,
|
|
|
|
+ void *vq, uint32_t desc)
|
|
|
|
+{
|
|
|
|
+ intptr_t i, j, opr_sz = simd_oprsz(desc);
|
|
|
|
+ int idx = simd_data(desc);
|
|
|
|
+ int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
|
|
|
|
+ intptr_t elements = opr_sz / 4;
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / 4, elements);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < elements; i += 16 / 4) {
|
|
|
|
+ int32_t mm = m[i];
|
|
|
|
+ for (j = 0; j < eltspersegment; ++j) {
|
|
|
|
+ d[i + j] = do_sqrdmlah_s(n[i + j], mm, d[i + j], true, true, vq);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ clear_tail(d, opr_sz, simd_maxsz(desc));
|
|
|
|
+}
|
|
|
|
+
|
|
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
|
|
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
|
|
void *va, uint32_t desc)
|
|
void *va, uint32_t desc)
|
|
{
|
|
{
|
|
@@ -907,7 +987,7 @@ void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
|
|
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
|
|
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
|
|
uint32_t neg_real = flip ^ neg_imag;
|
|
uint32_t neg_real = flip ^ neg_imag;
|
|
intptr_t elements = opr_sz / sizeof(float16);
|
|
intptr_t elements = opr_sz / sizeof(float16);
|
|
- intptr_t eltspersegment = 16 / sizeof(float16);
|
|
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / sizeof(float16), elements);
|
|
intptr_t i, j;
|
|
intptr_t i, j;
|
|
|
|
|
|
/* Shift boolean to the sign bit so we can xor to negate. */
|
|
/* Shift boolean to the sign bit so we can xor to negate. */
|
|
@@ -969,7 +1049,7 @@ void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
|
|
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
|
|
intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
|
|
uint32_t neg_real = flip ^ neg_imag;
|
|
uint32_t neg_real = flip ^ neg_imag;
|
|
intptr_t elements = opr_sz / sizeof(float32);
|
|
intptr_t elements = opr_sz / sizeof(float32);
|
|
- intptr_t eltspersegment = 16 / sizeof(float32);
|
|
|
|
|
|
+ intptr_t eltspersegment = MIN(16 / sizeof(float32), elements);
|
|
intptr_t i, j;
|
|
intptr_t i, j;
|
|
|
|
|
|
/* Shift boolean to the sign bit so we can xor to negate. */
|
|
/* Shift boolean to the sign bit so we can xor to negate. */
|