|
@@ -32,223 +32,190 @@
|
|
|
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni")))
|
|
|
|
|
|
|
|
|
+static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
+_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
+{
|
|
|
+ return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
|
|
|
+ (__v8si)__B);
|
|
|
+}
|
|
|
+
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
|
|
|
+ (__v8si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
|
|
|
+ (__v8si)_mm256_setzero_si256());
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
-_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
+_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
|
|
|
+ (__v8si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
|
|
|
+ (__v8si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
-}
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
|
|
|
+ (__v8si)_mm256_setzero_si256());
|
|
|
+}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
-_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
+_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
|
|
|
+ (__v8si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
|
|
|
+ (__v8si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
|
|
|
+ (__v8si)_mm256_setzero_si256());
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
-_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
+_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
|
|
|
+ (__v8si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
|
|
|
+ (__v8si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m256i)__builtin_ia32_selectd_256(__U,
|
|
|
+ (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
|
|
|
+ (__v8si)_mm256_setzero_si256());
|
|
|
}
|
|
|
|
|
|
-static __inline__ __m256i __DEFAULT_FN_ATTRS
|
|
|
-_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
|
|
|
+static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
+_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S,
|
|
|
- (__v8si) __A,
|
|
|
- (__v8si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
|
|
|
+ (__v4si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
|
|
|
+ (__v4si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
|
|
|
+ (__v4si)_mm_setzero_si128());
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
-_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
+_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
|
|
|
+ (__v4si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
|
|
|
+ (__v4si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
|
|
|
+ (__v4si)_mm_setzero_si128());
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
-_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
+_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
|
|
|
+ (__v4si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
|
|
|
+ (__v4si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
|
|
|
+ (__v4si)_mm_setzero_si128());
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
-_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
+_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
+ return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
|
|
|
+ (__v4si)__B);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
|
|
|
+ (__v4si)__S);
|
|
|
}
|
|
|
|
|
|
static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
|
|
|
{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) __U);
|
|
|
+ return (__m128i)__builtin_ia32_selectd_128(__U,
|
|
|
+ (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
|
|
|
+ (__v4si)_mm_setzero_si128());
|
|
|
}
|
|
|
|
|
|
-static __inline__ __m128i __DEFAULT_FN_ATTRS
|
|
|
-_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
|
|
|
-{
|
|
|
- return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S,
|
|
|
- (__v4si) __A,
|
|
|
- (__v4si) __B,
|
|
|
- (__mmask8) -1);
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
#undef __DEFAULT_FN_ATTRS
|
|
|
|
|
|
#endif
|