From f22fab559e5220dac4c81cc9fa573385e5e26c8d Mon Sep 17 00:00:00 2001
From: sayantn
- - * [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64) - * [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64) - * [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64) - * [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64) -
- * [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) * [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps) - * [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) * [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps) - * [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) * [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps) * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh) - * [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) * [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps) - * [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) * [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps) - * [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) * [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps) * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh) * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
- - * [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32) - * [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32) - * [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32) - * [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32) - * [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32) - * [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32) - * [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32) - * [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32) -
- - * [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32) - * [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32) - * [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32) - * [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32) - * [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32) - * [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32) - * [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32) - * [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32) - * [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32) - * [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32) - * [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32) - * [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32) -
- - * [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32) - * [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32) - * [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32) - * [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32) - * [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32) - * [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32) - * [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32) - * [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32) - * [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32) - * [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32) - * [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32) - * [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32) -
* [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy) diff --git a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs index 01bb704ae73a..3bf9958e3da8 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs @@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64( simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52huq) +)] +pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52huq_256(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit /// unsigned integer from the intermediate result with the @@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64( simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52luq) +)] +pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i { + vpmadd52luq_256(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit /// unsigned integer from the intermediate result with the @@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64( simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52huq) +)] +pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52huq_128(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit /// unsigned integer from the intermediate result with the @@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _ simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128()) } +/// Multiply packed unsigned 52-bit integers in each 64-bit element of +/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit +/// unsigned integer from the intermediate result with the +/// corresponding unsigned 64-bit integer in `a`, and store the +/// results in `dst`. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64) +#[inline] +#[target_feature(enable = "avxifma")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpmadd52luq) +)] +pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i { + vpmadd52luq_128(a, b, c) +} + /// Multiply packed unsigned 52-bit integers in each 64-bit element of /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit /// unsigned integer from the intermediate result with the @@ -427,6 +499,20 @@ unsafe fn test_mm512_maskz_madd52lo_epu64() { assert_eq_m512i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52hi_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm256_set1_epi64x(11030549757952); + + assert_eq_m256i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm256_madd52hi_epu64() { let a = _mm256_set1_epi64x(10 << 40); @@ -471,6 +557,20 @@ unsafe fn test_mm256_maskz_madd52hi_epu64() { assert_eq_m256i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm256_madd52lo_avx_epu64() { + let a = _mm256_set1_epi64x(10 << 40); + let b = _mm256_set1_epi64x((11 << 40) + 4); + let c = _mm256_set1_epi64x((12 << 40) + 3); + + let actual = _mm256_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm256_set1_epi64x(100055558127628); + + assert_eq_m256i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm256_madd52lo_epu64() { let a = _mm256_set1_epi64x(10 << 40); @@ -515,6 +615,20 @@ unsafe fn test_mm256_maskz_madd52lo_epu64() { assert_eq_m256i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52hi_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52hi_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52) + let expected = _mm_set1_epi64x(11030549757952); + + assert_eq_m128i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm_madd52hi_epu64() { let a = _mm_set1_epi64x(10 << 40); @@ -559,6 +673,20 @@ unsafe fn test_mm_maskz_madd52hi_epu64() { assert_eq_m128i(expected, actual); } + #[simd_test(enable = "avxifma")] + unsafe fn test_mm_madd52lo_avx_epu64() { + let a = _mm_set1_epi64x(10 << 40); + let b = _mm_set1_epi64x((11 << 40) + 4); + let c = _mm_set1_epi64x((12 << 40) + 3); + + let actual = _mm_madd52lo_avx_epu64(a, b, c); + + // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52)) + let expected = _mm_set1_epi64x(100055558127628); + + assert_eq_m128i(expected, actual); + } + #[simd_test(enable = "avx512ifma,avx512vl")] unsafe fn test_mm_madd52lo_epu64() { let a = _mm_set1_epi64x(10 << 40); diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs index 67a626b7ede8..2ed800d2957c 100644 --- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs +++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs @@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssd) +)] +pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216) @@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssd) +)] +pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213) @@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssds) +)] +pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225) @@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwssds) +)] +pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222) @@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusd) +)] +pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198) @@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusd) +)] +pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195) @@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusds) +)] +pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207) @@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695) +#[inline] +#[target_feature(enable = "avxvnni")] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbusds) +)] +pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst. /// /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204) @@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32( transmute(simd_select_bitmask(k, r, zero)) } +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbssds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit +/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711) +#[inline] +#[target_feature(enable = "avxvnniint8")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpbuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwsuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusd) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwusds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuud) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i { + transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4())) +} + +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit +/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding +/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst. +/// +/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749) +#[inline] +#[target_feature(enable = "avxvnniint16")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vpdpwuuds) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i { + transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8())) +} + #[allow(improper_ctypes)] extern "C" { #[link_name = "llvm.x86.avx512.vpdpwssd.512"] @@ -555,6 +1051,66 @@ pub unsafe fn _mm_maskz_dpbusds_epi32( fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; #[link_name = "llvm.x86.avx512.vpdpbusds.128"] fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + + #[link_name = "llvm.x86.avx2.vpdpbssd.128"] + fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssd.256"] + fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbssds.128"] + fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbssds.256"] + fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsud.128"] + fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsud.256"] + fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbsuds.128"] + fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbsuds.256"] + fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuud.128"] + fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuud.256"] + fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpbuuds.128"] + fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpbuuds.256"] + fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsud.128"] + fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsud.256"] + fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwsuds.128"] + fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwsuds.256"] + fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusd.128"] + fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusd.256"] + fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwusds.128"] + fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwusds.256"] + fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuud.128"] + fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuud.256"] + fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; + + #[link_name = "llvm.x86.avx2.vpdpwuuds.128"] + fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4; + #[link_name = "llvm.x86.avx2.vpdpwuuds.256"] + fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8; } #[cfg(test)] @@ -597,6 +1153,16 @@ unsafe fn test_mm512_maskz_dpwssd_epi32() { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpwssd_epi32() { let src = _mm256_set1_epi32(1); @@ -631,6 +1197,16 @@ unsafe fn test_mm256_maskz_dpwssd_epi32() { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpwssd_epi32() { let src = _mm_set1_epi32(1); @@ -699,6 +1275,16 @@ unsafe fn test_mm512_maskz_dpwssds_epi32() { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpwssds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwssds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpwssds_epi32() { let src = _mm256_set1_epi32(1); @@ -733,6 +1319,16 @@ unsafe fn test_mm256_maskz_dpwssds_epi32() { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpwssds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwssds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpwssds_epi32() { let src = _mm_set1_epi32(1); @@ -801,6 +1397,16 @@ unsafe fn test_mm512_maskz_dpbusd_epi32() { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusd_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusd_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpbusd_epi32() { let src = _mm256_set1_epi32(1); @@ -835,6 +1441,16 @@ unsafe fn test_mm256_maskz_dpbusd_epi32() { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusd_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusd_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpbusd_epi32() { let src = _mm_set1_epi32(1); @@ -903,6 +1519,16 @@ unsafe fn test_mm512_maskz_dpbusds_epi32() { assert_eq_m512i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm256_dpbusds_avx_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbusds_avx_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm256_dpbusds_epi32() { let src = _mm256_set1_epi32(1); @@ -937,6 +1563,16 @@ unsafe fn test_mm256_maskz_dpbusds_epi32() { assert_eq_m256i(r, e); } + #[simd_test(enable = "avxvnni")] + unsafe fn test_mm_dpbusds_avx_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbusds_avx_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + #[simd_test(enable = "avx512vnni,avx512vl")] unsafe fn test_mm_dpbusds_epi32() { let src = _mm_set1_epi32(1); @@ -970,4 +1606,244 @@ unsafe fn test_mm_maskz_dpbusds_epi32() { let e = _mm_set1_epi32(5); assert_eq_m128i(r, e); } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssd_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssd_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbssds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbssds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbssds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbssds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbsuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuud_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuud_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm_dpbuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm_dpbuuds_epi32(src, a, b); + let e = _mm_set1_epi32(5); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint8")] + unsafe fn test_mm256_dpbuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0); + let r = _mm256_dpbuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(5); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwsuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwsuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwsuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwsuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusd_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusd_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusd_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusd_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwusds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwusds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwusds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwusds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuud_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuud_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuud_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuud_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm_dpwuuds_epi32() { + let src = _mm_set1_epi32(1); + let a = _mm_set1_epi32(1 << 16 | 1 << 0); + let b = _mm_set1_epi32(1 << 16 | 1 << 0); + let r = _mm_dpwuuds_epi32(src, a, b); + let e = _mm_set1_epi32(3); + assert_eq_m128i(r, e); + } + + #[simd_test(enable = "avxvnniint16")] + unsafe fn test_mm256_dpwuuds_epi32() { + let src = _mm256_set1_epi32(1); + let a = _mm256_set1_epi32(1 << 16 | 1 << 0); + let b = _mm256_set1_epi32(1 << 16 | 1 << 0); + let r = _mm256_dpwuuds_epi32(src, a, b); + let e = _mm256_set1_epi32(3); + assert_eq_m256i(r, e); + } } diff --git a/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs new file mode 100644 index 000000000000..1d29cfc0f59a --- /dev/null +++ b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs @@ -0,0 +1,188 @@ +use crate::core_arch::{simd::*, x86::*}; + +#[cfg(test)] +use stdarch_test::assert_instr; + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) +/// floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vbcstnebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 { + transmute(bcstnebf162ps_128(a)) +} + +/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location +/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point +/// elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vbcstnebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_bcstnebf16_ps(a: *const u16) -> __m256 { + transmute(bcstnebf162ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneebf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneebf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneebf162ps_256(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneobf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 { + transmute(cvtneobf162ps_128(a)) +} + +/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at +/// location a to single precision (32-bit) floating-point elements, and store the results in dst. +/// +/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps) +#[inline] +#[target_feature(enable = "avxneconvert")] +#[cfg_attr( + all(test, any(target_os = "linux", target_env = "msvc")), + assert_instr(vcvtneobf162ps) +)] +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 { + transmute(cvtneobf162ps_256(a)) +} + +#[allow(improper_ctypes)] +extern "C" { + #[link_name = "llvm.x86.vbcstnebf162ps128"] + fn bcstnebf162ps_128(a: *const u16) -> f32x4; + #[link_name = "llvm.x86.vbcstnebf162ps256"] + fn bcstnebf162ps_256(a: *const u16) -> f32x8; + + #[link_name = "llvm.x86.vcvtneebf162ps128"] + fn cvtneebf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneebf162ps256"] + fn cvtneebf162ps_256(a: *const __m256bh) -> __m256; + + #[link_name = "llvm.x86.vcvtneobf162ps128"] + fn cvtneobf162ps_128(a: *const __m128bh) -> __m128; + #[link_name = "llvm.x86.vcvtneobf162ps256"] + fn cvtneobf162ps_256(a: *const __m256bh) -> __m256; +} + +#[cfg(test)] +mod tests { + use crate::core_arch::x86::*; + use std::ptr::addr_of; + use stdarch_test::simd_test; + + const BF16_ONE: u16 = 0b0_01111111_0000000; + const BF16_TWO: u16 = 0b0_10000000_0000000; + const BF16_THREE: u16 = 0b0_10000000_1000000; + const BF16_FOUR: u16 = 0b0_10000001_0000000; + const BF16_FIVE: u16 = 0b0_10000001_0100000; + const BF16_SIX: u16 = 0b0_10000001_1000000; + const BF16_SEVEN: u16 = 0b0_10000001_1100000; + const BF16_EIGHT: u16 = 0b0_10000010_0000000; + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_bcstnebf16_ps() { + let a = BF16_ONE; + let r = _mm_bcstnebf16_ps(addr_of!(a)); + let e = _mm_set_ps(1., 1., 1., 1.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_bcstnebf16_ps() { + let a = BF16_ONE; + let r = _mm256_bcstnebf16_ps(addr_of!(a)); + let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneebf16_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm_cvtneebf16_ps(addr_of!(a)); + let e = _mm_setr_ps(1., 3., 5., 7.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneebf16_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm256_cvtneebf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.); + assert_eq_m256(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm_cvtneobf16_ps() { + let a = __m128bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm_cvtneobf16_ps(addr_of!(a)); + let e = _mm_setr_ps(2., 4., 6., 8.); + assert_eq_m128(r, e); + } + + #[simd_test(enable = "avxneconvert")] + unsafe fn test_mm256_cvtneobf16_ps() { + let a = __m256bh( + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT, + ); + let r = _mm256_cvtneobf16_ps(addr_of!(a)); + let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.); + assert_eq_m256(r, e); + } +} diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs index 8b1d3bbbb6af..6f8c51c16a98 100644 --- a/library/stdarch/crates/core_arch/src/x86/mod.rs +++ b/library/stdarch/crates/core_arch/src/x86/mod.rs @@ -894,6 +894,9 @@ fn as_m512bh(self) -> Self { pub use self::f16c::*; mod avx512bf16; - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] pub use self::avx512bf16::*; + +mod avxneconvert; +#[unstable(feature = "stdarch_x86_avx512", issue = "111137")] +pub use self::avxneconvert::*; diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs index a8c2d36e113d..a2835e3b0c4e 100644 --- a/library/stdarch/crates/stdarch-test/src/lib.rs +++ b/library/stdarch/crates/stdarch-test/src/lib.rs @@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // 2. It is a mark, indicating that the instruction will be // compiled into other instructions - mainly because of llvm // optimization. - let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected)); + let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected)); // Look for subroutine call instructions in the disassembly to detect whether // inlining failed: all intrinsics are `#[inline(always)]`, so calling one diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs index 15d2454f4304..f12fa5f0eed6 100644 --- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs +++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs @@ -758,7 +758,7 @@ fn equate( (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {} (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {} (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {} - (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {} + (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*" | "__bf16 const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {} (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}