mirror of
https://github.com/rust-lang/rust.git
synced 2026-05-30 04:56:25 +03:00
Implemented VEX versions
Modified stdarch-test to accept VEX versions
This commit is contained in:
committed by
Amanieu d'Antras
parent
775dcaabde
commit
f22fab559e
@@ -1135,82 +1135,20 @@
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX_IFMA"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64)
|
||||
* [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64)
|
||||
* [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64)
|
||||
* [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX_NE_CONVERT"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
|
||||
* [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
|
||||
* [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
|
||||
* [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
|
||||
* [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
|
||||
* [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
|
||||
* [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
|
||||
* [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
|
||||
* [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
|
||||
* [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
|
||||
* [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
|
||||
* [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
|
||||
* [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
|
||||
* [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
|
||||
* [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX_VNNI"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32)
|
||||
* [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32)
|
||||
* [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32)
|
||||
* [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32)
|
||||
* [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32)
|
||||
* [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32)
|
||||
* [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32)
|
||||
* [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX_VNNI_INT16"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32)
|
||||
* [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32)
|
||||
* [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32)
|
||||
* [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32)
|
||||
* [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32)
|
||||
* [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32)
|
||||
* [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32)
|
||||
* [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32)
|
||||
* [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32)
|
||||
* [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32)
|
||||
* [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32)
|
||||
* [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["AVX_VNNI_INT8"]</summary><p>
|
||||
|
||||
* [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32)
|
||||
* [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32)
|
||||
* [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32)
|
||||
* [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32)
|
||||
* [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32)
|
||||
* [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32)
|
||||
* [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32)
|
||||
* [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32)
|
||||
* [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32)
|
||||
* [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32)
|
||||
* [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32)
|
||||
* [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32)
|
||||
</p></details>
|
||||
|
||||
|
||||
<details><summary>["CET_SS"]</summary><p>
|
||||
|
||||
* [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy)
|
||||
|
||||
@@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
|
||||
simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpmadd52huq)
|
||||
)]
|
||||
pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
vpmadd52huq_256(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
@@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
|
||||
simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpmadd52luq)
|
||||
)]
|
||||
pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
|
||||
vpmadd52luq_256(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
@@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
|
||||
simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpmadd52huq)
|
||||
)]
|
||||
pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
vpmadd52huq_128(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
@@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
|
||||
simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
/// corresponding unsigned 64-bit integer in `a`, and store the
|
||||
/// results in `dst`.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxifma")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpmadd52luq)
|
||||
)]
|
||||
pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
|
||||
vpmadd52luq_128(a, b, c)
|
||||
}
|
||||
|
||||
/// Multiply packed unsigned 52-bit integers in each 64-bit element of
|
||||
/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
|
||||
/// unsigned integer from the intermediate result with the
|
||||
@@ -427,6 +499,20 @@ unsafe fn test_mm512_maskz_madd52lo_epu64() {
|
||||
assert_eq_m512i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxifma")]
|
||||
unsafe fn test_mm256_madd52hi_avx_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_madd52hi_avx_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm256_set1_epi64x(11030549757952);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_madd52hi_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
@@ -471,6 +557,20 @@ unsafe fn test_mm256_maskz_madd52hi_epu64() {
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxifma")]
|
||||
unsafe fn test_mm256_madd52lo_avx_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
let b = _mm256_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm256_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm256_madd52lo_avx_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let expected = _mm256_set1_epi64x(100055558127628);
|
||||
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm256_madd52lo_epu64() {
|
||||
let a = _mm256_set1_epi64x(10 << 40);
|
||||
@@ -515,6 +615,20 @@ unsafe fn test_mm256_maskz_madd52lo_epu64() {
|
||||
assert_eq_m256i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxifma")]
|
||||
unsafe fn test_mm_madd52hi_avx_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_madd52hi_avx_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
|
||||
let expected = _mm_set1_epi64x(11030549757952);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_madd52hi_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
@@ -559,6 +673,20 @@ unsafe fn test_mm_maskz_madd52hi_epu64() {
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxifma")]
|
||||
unsafe fn test_mm_madd52lo_avx_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
let b = _mm_set1_epi64x((11 << 40) + 4);
|
||||
let c = _mm_set1_epi64x((12 << 40) + 3);
|
||||
|
||||
let actual = _mm_madd52lo_avx_epu64(a, b, c);
|
||||
|
||||
// (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
|
||||
let expected = _mm_set1_epi64x(100055558127628);
|
||||
|
||||
assert_eq_m128i(expected, actual);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512ifma,avx512vl")]
|
||||
unsafe fn test_mm_madd52lo_epu64() {
|
||||
let a = _mm_set1_epi64x(10 << 40);
|
||||
|
||||
@@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwssd)
|
||||
)]
|
||||
pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216)
|
||||
@@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwssd)
|
||||
)]
|
||||
pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213)
|
||||
@@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwssds)
|
||||
)]
|
||||
pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225)
|
||||
@@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwssds)
|
||||
)]
|
||||
pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222)
|
||||
@@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbusd)
|
||||
)]
|
||||
pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198)
|
||||
@@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbusd)
|
||||
)]
|
||||
pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195)
|
||||
@@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbusds)
|
||||
)]
|
||||
pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207)
|
||||
@@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnni")]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbusds)
|
||||
)]
|
||||
pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204)
|
||||
@@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
|
||||
transmute(simd_select_bitmask(k, r, zero))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbssd)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbssd)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbssds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbssds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbsud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbsud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbsuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbsuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbuud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbuud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbuuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
|
||||
/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint8")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpbuuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwsud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwsud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwsuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwsuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwusd)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwusd)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwusds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwusds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwuud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwuud)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwuuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
|
||||
transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
|
||||
}
|
||||
|
||||
/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
|
||||
/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
|
||||
/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxvnniint16")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vpdpwuuds)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
|
||||
transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.avx512.vpdpwssd.512"]
|
||||
@@ -555,6 +1051,66 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
|
||||
fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
#[link_name = "llvm.x86.avx512.vpdpbusds.128"]
|
||||
fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbssd.128"]
|
||||
fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbssd.256"]
|
||||
fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbssds.128"]
|
||||
fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbssds.256"]
|
||||
fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbsud.128"]
|
||||
fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbsud.256"]
|
||||
fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbsuds.128"]
|
||||
fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbsuds.256"]
|
||||
fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbuud.128"]
|
||||
fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbuud.256"]
|
||||
fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpbuuds.128"]
|
||||
fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpbuuds.256"]
|
||||
fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwsud.128"]
|
||||
fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwsud.256"]
|
||||
fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwsuds.128"]
|
||||
fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwsuds.256"]
|
||||
fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwusd.128"]
|
||||
fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwusd.256"]
|
||||
fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwusds.128"]
|
||||
fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwusds.256"]
|
||||
fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwuud.128"]
|
||||
fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwuud.256"]
|
||||
fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
|
||||
#[link_name = "llvm.x86.avx2.vpdpwuuds.128"]
|
||||
fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
|
||||
#[link_name = "llvm.x86.avx2.vpdpwuuds.256"]
|
||||
fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -597,6 +1153,16 @@ unsafe fn test_mm512_maskz_dpwssd_epi32() {
|
||||
assert_eq_m512i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm256_dpwssd_avx_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwssd_avx_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm256_dpwssd_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
@@ -631,6 +1197,16 @@ unsafe fn test_mm256_maskz_dpwssd_epi32() {
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm_dpwssd_avx_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwssd_avx_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm_dpwssd_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
@@ -699,6 +1275,16 @@ unsafe fn test_mm512_maskz_dpwssds_epi32() {
|
||||
assert_eq_m512i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm256_dpwssds_avx_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwssds_avx_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm256_dpwssds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
@@ -733,6 +1319,16 @@ unsafe fn test_mm256_maskz_dpwssds_epi32() {
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm_dpwssds_avx_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwssds_avx_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm_dpwssds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
@@ -801,6 +1397,16 @@ unsafe fn test_mm512_maskz_dpbusd_epi32() {
|
||||
assert_eq_m512i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm256_dpbusd_avx_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbusd_avx_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm256_dpbusd_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
@@ -835,6 +1441,16 @@ unsafe fn test_mm256_maskz_dpbusd_epi32() {
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm_dpbusd_avx_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbusd_avx_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm_dpbusd_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
@@ -903,6 +1519,16 @@ unsafe fn test_mm512_maskz_dpbusds_epi32() {
|
||||
assert_eq_m512i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm256_dpbusds_avx_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbusds_avx_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm256_dpbusds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
@@ -937,6 +1563,16 @@ unsafe fn test_mm256_maskz_dpbusds_epi32() {
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnni")]
|
||||
unsafe fn test_mm_dpbusds_avx_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbusds_avx_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avx512vnni,avx512vl")]
|
||||
unsafe fn test_mm_dpbusds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
@@ -970,4 +1606,244 @@ unsafe fn test_mm_maskz_dpbusds_epi32() {
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbssd_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbssd_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbssd_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbssd_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbssds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbssds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbssds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbssds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbsud_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbsud_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbsud_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbsud_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbsuds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbsuds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbsuds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbsuds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbuud_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbuud_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbuud_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbuud_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm_dpbuuds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm_dpbuuds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(5);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint8")]
|
||||
unsafe fn test_mm256_dpbuuds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
|
||||
let r = _mm256_dpbuuds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(5);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwsud_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwsud_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwsud_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwsud_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwsuds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwsuds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwsuds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwsuds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwusd_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwusd_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwusd_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwusd_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwusds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwusds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwusds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwusds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwuud_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwuud_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwuud_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwuud_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm_dpwuuds_epi32() {
|
||||
let src = _mm_set1_epi32(1);
|
||||
let a = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm_dpwuuds_epi32(src, a, b);
|
||||
let e = _mm_set1_epi32(3);
|
||||
assert_eq_m128i(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxvnniint16")]
|
||||
unsafe fn test_mm256_dpwuuds_epi32() {
|
||||
let src = _mm256_set1_epi32(1);
|
||||
let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
|
||||
let r = _mm256_dpwuuds_epi32(src, a, b);
|
||||
let e = _mm256_set1_epi32(3);
|
||||
assert_eq_m256i(r, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,188 @@
|
||||
use crate::core_arch::{simd::*, x86::*};
|
||||
|
||||
#[cfg(test)]
|
||||
use stdarch_test::assert_instr;
|
||||
|
||||
/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
|
||||
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
|
||||
/// floating-point elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vbcstnebf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 {
|
||||
transmute(bcstnebf162ps_128(a))
|
||||
}
|
||||
|
||||
/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
|
||||
/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
|
||||
/// elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vbcstnebf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_bcstnebf16_ps(a: *const u16) -> __m256 {
|
||||
transmute(bcstnebf162ps_256(a))
|
||||
}
|
||||
|
||||
/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
|
||||
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vcvtneebf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
|
||||
transmute(cvtneebf162ps_128(a))
|
||||
}
|
||||
|
||||
/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
|
||||
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vcvtneebf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
|
||||
transmute(cvtneebf162ps_256(a))
|
||||
}
|
||||
|
||||
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
|
||||
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vcvtneobf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
|
||||
transmute(cvtneobf162ps_128(a))
|
||||
}
|
||||
|
||||
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
|
||||
/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
|
||||
///
|
||||
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
|
||||
#[inline]
|
||||
#[target_feature(enable = "avxneconvert")]
|
||||
#[cfg_attr(
|
||||
all(test, any(target_os = "linux", target_env = "msvc")),
|
||||
assert_instr(vcvtneobf162ps)
|
||||
)]
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
|
||||
transmute(cvtneobf162ps_256(a))
|
||||
}
|
||||
|
||||
#[allow(improper_ctypes)]
|
||||
extern "C" {
|
||||
#[link_name = "llvm.x86.vbcstnebf162ps128"]
|
||||
fn bcstnebf162ps_128(a: *const u16) -> f32x4;
|
||||
#[link_name = "llvm.x86.vbcstnebf162ps256"]
|
||||
fn bcstnebf162ps_256(a: *const u16) -> f32x8;
|
||||
|
||||
#[link_name = "llvm.x86.vcvtneebf162ps128"]
|
||||
fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
|
||||
#[link_name = "llvm.x86.vcvtneebf162ps256"]
|
||||
fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
|
||||
|
||||
#[link_name = "llvm.x86.vcvtneobf162ps128"]
|
||||
fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
|
||||
#[link_name = "llvm.x86.vcvtneobf162ps256"]
|
||||
fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::core_arch::x86::*;
|
||||
use std::ptr::addr_of;
|
||||
use stdarch_test::simd_test;
|
||||
|
||||
const BF16_ONE: u16 = 0b0_01111111_0000000;
|
||||
const BF16_TWO: u16 = 0b0_10000000_0000000;
|
||||
const BF16_THREE: u16 = 0b0_10000000_1000000;
|
||||
const BF16_FOUR: u16 = 0b0_10000001_0000000;
|
||||
const BF16_FIVE: u16 = 0b0_10000001_0100000;
|
||||
const BF16_SIX: u16 = 0b0_10000001_1000000;
|
||||
const BF16_SEVEN: u16 = 0b0_10000001_1100000;
|
||||
const BF16_EIGHT: u16 = 0b0_10000010_0000000;
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm_bcstnebf16_ps() {
|
||||
let a = BF16_ONE;
|
||||
let r = _mm_bcstnebf16_ps(addr_of!(a));
|
||||
let e = _mm_set_ps(1., 1., 1., 1.);
|
||||
assert_eq_m128(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm256_bcstnebf16_ps() {
|
||||
let a = BF16_ONE;
|
||||
let r = _mm256_bcstnebf16_ps(addr_of!(a));
|
||||
let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
|
||||
assert_eq_m256(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm_cvtneebf16_ps() {
|
||||
let a = __m128bh(
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
);
|
||||
let r = _mm_cvtneebf16_ps(addr_of!(a));
|
||||
let e = _mm_setr_ps(1., 3., 5., 7.);
|
||||
assert_eq_m128(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm256_cvtneebf16_ps() {
|
||||
let a = __m256bh(
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
);
|
||||
let r = _mm256_cvtneebf16_ps(addr_of!(a));
|
||||
let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
|
||||
assert_eq_m256(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm_cvtneobf16_ps() {
|
||||
let a = __m128bh(
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
);
|
||||
let r = _mm_cvtneobf16_ps(addr_of!(a));
|
||||
let e = _mm_setr_ps(2., 4., 6., 8.);
|
||||
assert_eq_m128(r, e);
|
||||
}
|
||||
|
||||
#[simd_test(enable = "avxneconvert")]
|
||||
unsafe fn test_mm256_cvtneobf16_ps() {
|
||||
let a = __m256bh(
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
|
||||
);
|
||||
let r = _mm256_cvtneobf16_ps(addr_of!(a));
|
||||
let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
|
||||
assert_eq_m256(r, e);
|
||||
}
|
||||
}
|
||||
@@ -894,6 +894,9 @@ fn as_m512bh(self) -> Self {
|
||||
pub use self::f16c::*;
|
||||
|
||||
mod avx512bf16;
|
||||
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub use self::avx512bf16::*;
|
||||
|
||||
mod avxneconvert;
|
||||
#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
|
||||
pub use self::avxneconvert::*;
|
||||
|
||||
@@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
|
||||
// 2. It is a mark, indicating that the instruction will be
|
||||
// compiled into other instructions - mainly because of llvm
|
||||
// optimization.
|
||||
let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
|
||||
let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected));
|
||||
|
||||
// Look for subroutine call instructions in the disassembly to detect whether
|
||||
// inlining failed: all intrinsics are `#[inline(always)]`, so calling one
|
||||
|
||||
@@ -758,7 +758,7 @@ fn equate(
|
||||
(&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*" | "__bf16 const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {}
|
||||
(&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user