From f22fab559e5220dac4c81cc9fa573385e5e26c8d Mon Sep 17 00:00:00 2001
From: sayantn <sayantan.chakraborty@students.iiserpune.ac.in>
Date: Mon, 1 Jul 2024 12:41:17 +0530
Subject: [PATCH] Implemented VEX versions

Modified stdarch-test to accept VEX versions
---
 .../stdarch/crates/core_arch/missing-x86.md   |  62 --
 .../crates/core_arch/src/x86/avx512ifma.rs    | 128 +++
 .../crates/core_arch/src/x86/avx512vnni.rs    | 876 ++++++++++++++++++
 .../crates/core_arch/src/x86/avxneconvert.rs  | 188 ++++
 .../stdarch/crates/core_arch/src/x86/mod.rs   |   5 +-
 .../stdarch/crates/stdarch-test/src/lib.rs    |   2 +-
 .../crates/stdarch-verify/tests/x86-intel.rs  |   2 +-
 7 files changed, 1198 insertions(+), 65 deletions(-)
 create mode 100644 library/stdarch/crates/core_arch/src/x86/avxneconvert.rs
diff --git a/library/stdarch/crates/core_arch/missing-x86.md b/library/stdarch/crates/core_arch/missing-x86.md
index 8d75a237a4a2..03c25d41fa43 100644
--- a/library/stdarch/crates/core_arch/missing-x86.md
+++ b/library/stdarch/crates/core_arch/missing-x86.md
@@ -1135,82 +1135,20 @@
 </p></details>
 
 
-<details><summary>["AVX_IFMA"]</summary><p>
-
-  * [ ] [`_mm256_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52hi_avx_epu64)
-  * [ ] [`_mm256_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_madd52lo_avx_epu64)
-  * [ ] [`_mm_madd52hi_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52hi_avx_epu64)
-  * [ ] [`_mm_madd52lo_avx_epu64`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_madd52lo_avx_epu64)
-</p></details>
-
-
 <details><summary>["AVX_NE_CONVERT"]</summary><p>
 
-  * [ ] [`_mm256_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
   * [ ] [`_mm256_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnesh_ps)
-  * [ ] [`_mm256_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
   * [ ] [`_mm256_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneeph_ps)
-  * [ ] [`_mm256_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
   * [ ] [`_mm256_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneoph_ps)
   * [ ] [`_mm256_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneps_avx_pbh)
-  * [ ] [`_mm_bcstnebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
   * [ ] [`_mm_bcstnesh_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnesh_ps)
-  * [ ] [`_mm_cvtneebf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
   * [ ] [`_mm_cvtneeph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneeph_ps)
-  * [ ] [`_mm_cvtneobf16_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
   * [ ] [`_mm_cvtneoph_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneoph_ps)
   * [ ] [`_mm_cvtneps_avx_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_avx_pbh)
   * [ ] [`_mm_cvtneps_pbh`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneps_pbh)
 </p></details>
 
 
-<details><summary>["AVX_VNNI"]</summary><p>
-
-  * [ ] [`_mm256_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusd_avx_epi32)
-  * [ ] [`_mm256_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbusds_avx_epi32)
-  * [ ] [`_mm256_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssd_avx_epi32)
-  * [ ] [`_mm256_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwssds_avx_epi32)
-  * [ ] [`_mm_dpbusd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusd_avx_epi32)
-  * [ ] [`_mm_dpbusds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbusds_avx_epi32)
-  * [ ] [`_mm_dpwssd_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssd_avx_epi32)
-  * [ ] [`_mm_dpwssds_avx_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwssds_avx_epi32)
-</p></details>
-
-
-<details><summary>["AVX_VNNI_INT16"]</summary><p>
-
-  * [ ] [`_mm256_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsud_epi32)
-  * [ ] [`_mm256_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwsuds_epi32)
-  * [ ] [`_mm256_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusd_epi32)
-  * [ ] [`_mm256_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwusds_epi32)
-  * [ ] [`_mm256_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuud_epi32)
-  * [ ] [`_mm256_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpwuuds_epi32)
-  * [ ] [`_mm_dpwsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsud_epi32)
-  * [ ] [`_mm_dpwsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwsuds_epi32)
-  * [ ] [`_mm_dpwusd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusd_epi32)
-  * [ ] [`_mm_dpwusds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwusds_epi32)
-  * [ ] [`_mm_dpwuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuud_epi32)
-  * [ ] [`_mm_dpwuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpwuuds_epi32)
-</p></details>
-
-
-<details><summary>["AVX_VNNI_INT8"]</summary><p>
-
-  * [ ] [`_mm256_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssd_epi32)
-  * [ ] [`_mm256_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbssds_epi32)
-  * [ ] [`_mm256_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsud_epi32)
-  * [ ] [`_mm256_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbsuds_epi32)
-  * [ ] [`_mm256_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuud_epi32)
-  * [ ] [`_mm256_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_dpbuuds_epi32)
-  * [ ] [`_mm_dpbssd_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssd_epi32)
-  * [ ] [`_mm_dpbssds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbssds_epi32)
-  * [ ] [`_mm_dpbsud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsud_epi32)
-  * [ ] [`_mm_dpbsuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbsuds_epi32)
-  * [ ] [`_mm_dpbuud_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuud_epi32)
-  * [ ] [`_mm_dpbuuds_epi32`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dpbuuds_epi32)
-</p></details>
-
-
 <details><summary>["CET_SS"]</summary><p>
 
   * [ ] [`_clrssbsy`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_clrssbsy)
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
index 01bb704ae73a..3bf9958e3da8 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512ifma.rs
@@ -114,6 +114,24 @@ pub unsafe fn _mm512_maskz_madd52lo_epu64(
     simd_select_bitmask(k, vpmadd52luq_512(a, b, c), _mm512_setzero_si512())
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm256_madd52hi_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52huq_256(a, b, c)
+}
+
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
 /// unsigned integer from the intermediate result with the
@@ -169,6 +187,24 @@ pub unsafe fn _mm256_maskz_madd52hi_epu64(
     simd_select_bitmask(k, vpmadd52huq_256(a, b, c), _mm256_setzero_si256())
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm256_madd52lo_avx_epu64(a: __m256i, b: __m256i, c: __m256i) -> __m256i {
+    vpmadd52luq_256(a, b, c)
+}
+
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
 /// unsigned integer from the intermediate result with the
@@ -224,6 +260,24 @@ pub unsafe fn _mm256_maskz_madd52lo_epu64(
     simd_select_bitmask(k, vpmadd52luq_256(a, b, c), _mm256_setzero_si256())
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52hi_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpmadd52huq)
+)]
+pub unsafe fn _mm_madd52hi_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52huq_128(a, b, c)
+}
+
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the high 52-bit
 /// unsigned integer from the intermediate result with the
@@ -269,6 +323,24 @@ pub unsafe fn _mm_maskz_madd52hi_epu64(k: __mmask8, a: __m128i, b: __m128i, c: _
     simd_select_bitmask(k, vpmadd52huq_128(a, b, c), _mm_setzero_si128())
 }
 
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of
+/// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the
+/// corresponding unsigned 64-bit integer in `a`, and store the
+/// results in `dst`.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd52lo_avx_epu64)
+#[inline]
+#[target_feature(enable = "avxifma")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpmadd52luq)
+)]
+pub unsafe fn _mm_madd52lo_avx_epu64(a: __m128i, b: __m128i, c: __m128i) -> __m128i {
+    vpmadd52luq_128(a, b, c)
+}
+
 /// Multiply packed unsigned 52-bit integers in each 64-bit element of
 /// `b` and `c` to form a 104-bit intermediate result. Add the low 52-bit
 /// unsigned integer from the intermediate result with the
@@ -427,6 +499,20 @@ unsafe fn test_mm512_maskz_madd52lo_epu64() {
         assert_eq_m512i(expected, actual);
     }
 
+    #[simd_test(enable = "avxifma")]
+    unsafe fn test_mm256_madd52hi_avx_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_madd52hi_avx_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm256_set1_epi64x(11030549757952);
+
+        assert_eq_m256i(expected, actual);
+    }
+
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm256_madd52hi_epu64() {
         let a = _mm256_set1_epi64x(10 << 40);
@@ -471,6 +557,20 @@ unsafe fn test_mm256_maskz_madd52hi_epu64() {
         assert_eq_m256i(expected, actual);
     }
 
+    #[simd_test(enable = "avxifma")]
+    unsafe fn test_mm256_madd52lo_avx_epu64() {
+        let a = _mm256_set1_epi64x(10 << 40);
+        let b = _mm256_set1_epi64x((11 << 40) + 4);
+        let c = _mm256_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm256_madd52lo_avx_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm256_set1_epi64x(100055558127628);
+
+        assert_eq_m256i(expected, actual);
+    }
+
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm256_madd52lo_epu64() {
         let a = _mm256_set1_epi64x(10 << 40);
@@ -515,6 +615,20 @@ unsafe fn test_mm256_maskz_madd52lo_epu64() {
         assert_eq_m256i(expected, actual);
     }
 
+    #[simd_test(enable = "avxifma")]
+    unsafe fn test_mm_madd52hi_avx_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_madd52hi_avx_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) >> 52)
+        let expected = _mm_set1_epi64x(11030549757952);
+
+        assert_eq_m128i(expected, actual);
+    }
+
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm_madd52hi_epu64() {
         let a = _mm_set1_epi64x(10 << 40);
@@ -559,6 +673,20 @@ unsafe fn test_mm_maskz_madd52hi_epu64() {
         assert_eq_m128i(expected, actual);
     }
 
+    #[simd_test(enable = "avxifma")]
+    unsafe fn test_mm_madd52lo_avx_epu64() {
+        let a = _mm_set1_epi64x(10 << 40);
+        let b = _mm_set1_epi64x((11 << 40) + 4);
+        let c = _mm_set1_epi64x((12 << 40) + 3);
+
+        let actual = _mm_madd52lo_avx_epu64(a, b, c);
+
+        // (10 << 40) + ((((11 << 40) + 4) * ((12 << 40) + 3)) % (1 << 52))
+        let expected = _mm_set1_epi64x(100055558127628);
+
+        assert_eq_m128i(expected, actual);
+    }
+
     #[simd_test(enable = "avx512ifma,avx512vl")]
     unsafe fn test_mm_madd52lo_epu64() {
         let a = _mm_set1_epi64x(10 << 40);
diff --git a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
index 67a626b7ede8..2ed800d2957c 100644
--- a/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
+++ b/library/stdarch/crates/core_arch/src/x86/avx512vnni.rs
@@ -50,6 +50,20 @@ pub unsafe fn _mm512_maskz_dpwssd_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_avx_epi32&expand=2713)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm256_dpwssd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwssd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssd_epi32&expand=2216)
@@ -96,6 +110,20 @@ pub unsafe fn _mm256_maskz_dpwssd_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_avx_epi32&expand=2712)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwssd)
+)]
+pub unsafe fn _mm_dpwssd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwssd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssd_epi32&expand=2213)
@@ -178,6 +206,20 @@ pub unsafe fn _mm512_maskz_dpwssds_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_avx_epi32&expand=2726)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm256_dpwssds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwssds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwssds_epi32&expand=2225)
@@ -224,6 +266,20 @@ pub unsafe fn _mm256_maskz_dpwssds_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_avx_epi32&expand=2725)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwssds)
+)]
+pub unsafe fn _mm_dpwssds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwssds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding 16-bit integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwssds_epi32&expand=2222)
@@ -311,6 +367,20 @@ pub unsafe fn _mm512_maskz_dpbusd_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_avx_epi32&expand=2683)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm256_dpbusd_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbusd256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusd_epi32&expand=2198)
@@ -357,6 +427,20 @@ pub unsafe fn _mm256_maskz_dpbusd_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_avx_epi32&expand=2682)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbusd)
+)]
+pub unsafe fn _mm_dpbusd_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbusd128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusd_epi32&expand=2195)
@@ -439,6 +523,20 @@ pub unsafe fn _mm512_maskz_dpbusds_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_avx_epi32&expand=2696)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm256_dpbusds_avx_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbusds256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbusds_epi32&expand=2207)
@@ -485,6 +583,20 @@ pub unsafe fn _mm256_maskz_dpbusds_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_avx_epi32&expand=2695)
+#[inline]
+#[target_feature(enable = "avxvnni")]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbusds)
+)]
+pub unsafe fn _mm_dpbusds_avx_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbusds128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding signed 8-bit integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding 32-bit integer in src using signed saturation, and store the packed 32-bit results in dst.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbusds_epi32&expand=2204)
@@ -526,6 +638,390 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
     transmute(simd_select_bitmask(k, r, zero))
 }
 
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssd_epi32&expand=2674)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbssd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssd_epi32&expand=2675)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbssd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbssd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbssds_epi32&expand=2676)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbssds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbssds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding signed 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbssds_epi32&expand=2677)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbssds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbssds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbssds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsud_epi32&expand=2678)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsud_epi32&expand=2679)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbsuds_epi32&expand=2680)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbsuds_epi32&expand=2681)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuud_epi32&expand=2708)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuud_epi32&expand=2709)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpbuuds_epi32&expand=2710)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpbuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpbuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in a with corresponding unsigned 8-bit
+/// integers in b, producing 4 intermediate signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpbuuds_epi32&expand=2711)
+#[inline]
+#[target_feature(enable = "avxvnniint8")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpbuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpbuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpbuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsud_epi32&expand=2738)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwsud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsud_epi32&expand=2739)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwsud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwsud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwsuds_epi32&expand=2740)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwsuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwsuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwsuds_epi32&expand=2741)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwsuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwsuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwsuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusd_epi32&expand=2742)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusd_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwusd_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusd_epi32&expand=2743)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwusd)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusd_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwusd_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwusds_epi32&expand=2744)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwusds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwusds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding signed 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwusds_epi32&expand=2745)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwusds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwusds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwusds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuud_epi32&expand=2746)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuud_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwuud_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuud_epi32&expand=2747)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwuud)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuud_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwuud_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dpwuuds_epi32&expand=2748)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_dpwuuds_epi32(src: __m128i, a: __m128i, b: __m128i) -> __m128i {
+    transmute(vpdpwuuds_128(src.as_i32x4(), a.as_i32x4(), b.as_i32x4()))
+}
+
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in a with corresponding unsigned 16-bit
+/// integers in b, producing 2 intermediate signed 32-bit results. Sum these 2 results with the corresponding
+/// 32-bit integer in src with signed saturation, and store the packed 32-bit results in dst.
+///
+/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_dpwuuds_epi32&expand=2749)
+#[inline]
+#[target_feature(enable = "avxvnniint16")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vpdpwuuds)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_dpwuuds_epi32(src: __m256i, a: __m256i, b: __m256i) -> __m256i {
+    transmute(vpdpwuuds_256(src.as_i32x8(), a.as_i32x8(), b.as_i32x8()))
+}
+
 #[allow(improper_ctypes)]
 extern "C" {
     #[link_name = "llvm.x86.avx512.vpdpwssd.512"]
@@ -555,6 +1051,66 @@ pub unsafe fn _mm_maskz_dpbusds_epi32(
     fn vpdpbusds256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
     #[link_name = "llvm.x86.avx512.vpdpbusds.128"]
     fn vpdpbusds128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+
+    #[link_name = "llvm.x86.avx2.vpdpbssd.128"]
+    fn vpdpbssd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbssd.256"]
+    fn vpdpbssd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpbssds.128"]
+    fn vpdpbssds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbssds.256"]
+    fn vpdpbssds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpbsud.128"]
+    fn vpdpbsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbsud.256"]
+    fn vpdpbsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpbsuds.128"]
+    fn vpdpbsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbsuds.256"]
+    fn vpdpbsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpbuud.128"]
+    fn vpdpbuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbuud.256"]
+    fn vpdpbuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpbuuds.128"]
+    fn vpdpbuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpbuuds.256"]
+    fn vpdpbuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwsud.128"]
+    fn vpdpwsud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwsud.256"]
+    fn vpdpwsud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwsuds.128"]
+    fn vpdpwsuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwsuds.256"]
+    fn vpdpwsuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwusd.128"]
+    fn vpdpwusd_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwusd.256"]
+    fn vpdpwusd_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwusds.128"]
+    fn vpdpwusds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwusds.256"]
+    fn vpdpwusds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwuud.128"]
+    fn vpdpwuud_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwuud.256"]
+    fn vpdpwuud_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
+
+    #[link_name = "llvm.x86.avx2.vpdpwuuds.128"]
+    fn vpdpwuuds_128(src: i32x4, a: i32x4, b: i32x4) -> i32x4;
+    #[link_name = "llvm.x86.avx2.vpdpwuuds.256"]
+    fn vpdpwuuds_256(src: i32x8, a: i32x8, b: i32x8) -> i32x8;
 }
 
 #[cfg(test)]
@@ -597,6 +1153,16 @@ unsafe fn test_mm512_maskz_dpwssd_epi32() {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm256_dpwssd_avx_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwssd_avx_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm256_dpwssd_epi32() {
         let src = _mm256_set1_epi32(1);
@@ -631,6 +1197,16 @@ unsafe fn test_mm256_maskz_dpwssd_epi32() {
         assert_eq_m256i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm_dpwssd_avx_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwssd_avx_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm_dpwssd_epi32() {
         let src = _mm_set1_epi32(1);
@@ -699,6 +1275,16 @@ unsafe fn test_mm512_maskz_dpwssds_epi32() {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm256_dpwssds_avx_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwssds_avx_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm256_dpwssds_epi32() {
         let src = _mm256_set1_epi32(1);
@@ -733,6 +1319,16 @@ unsafe fn test_mm256_maskz_dpwssds_epi32() {
         assert_eq_m256i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm_dpwssds_avx_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwssds_avx_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm_dpwssds_epi32() {
         let src = _mm_set1_epi32(1);
@@ -801,6 +1397,16 @@ unsafe fn test_mm512_maskz_dpbusd_epi32() {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm256_dpbusd_avx_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbusd_avx_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm256_dpbusd_epi32() {
         let src = _mm256_set1_epi32(1);
@@ -835,6 +1441,16 @@ unsafe fn test_mm256_maskz_dpbusd_epi32() {
         assert_eq_m256i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm_dpbusd_avx_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbusd_avx_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm_dpbusd_epi32() {
         let src = _mm_set1_epi32(1);
@@ -903,6 +1519,16 @@ unsafe fn test_mm512_maskz_dpbusds_epi32() {
         assert_eq_m512i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm256_dpbusds_avx_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbusds_avx_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm256_dpbusds_epi32() {
         let src = _mm256_set1_epi32(1);
@@ -937,6 +1563,16 @@ unsafe fn test_mm256_maskz_dpbusds_epi32() {
         assert_eq_m256i(r, e);
     }
 
+    #[simd_test(enable = "avxvnni")]
+    unsafe fn test_mm_dpbusds_avx_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbusds_avx_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
     #[simd_test(enable = "avx512vnni,avx512vl")]
     unsafe fn test_mm_dpbusds_epi32() {
         let src = _mm_set1_epi32(1);
@@ -970,4 +1606,244 @@ unsafe fn test_mm_maskz_dpbusds_epi32() {
         let e = _mm_set1_epi32(5);
         assert_eq_m128i(r, e);
     }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbssd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbssd_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbssd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbssd_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbssds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbssds_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbssds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbssds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbsud_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbsud_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbsud_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbsud_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbsuds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbsuds_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbsuds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbsuds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbuud_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbuud_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbuud_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbuud_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm_dpbuuds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm_dpbuuds_epi32(src, a, b);
+        let e = _mm_set1_epi32(5);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint8")]
+    unsafe fn test_mm256_dpbuuds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 24 | 1 << 16 | 1 << 8 | 1 << 0);
+        let r = _mm256_dpbuuds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(5);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwsud_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwsud_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwsud_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwsud_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwsuds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwsuds_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwsuds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwsuds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwusd_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwusd_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwusd_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwusd_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwusds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwusds_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwusds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwusds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwuud_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwuud_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwuud_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwuud_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm_dpwuuds_epi32() {
+        let src = _mm_set1_epi32(1);
+        let a = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm_dpwuuds_epi32(src, a, b);
+        let e = _mm_set1_epi32(3);
+        assert_eq_m128i(r, e);
+    }
+
+    #[simd_test(enable = "avxvnniint16")]
+    unsafe fn test_mm256_dpwuuds_epi32() {
+        let src = _mm256_set1_epi32(1);
+        let a = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let b = _mm256_set1_epi32(1 << 16 | 1 << 0);
+        let r = _mm256_dpwuuds_epi32(src, a, b);
+        let e = _mm256_set1_epi32(3);
+        assert_eq_m256i(r, e);
+    }
 }
diff --git a/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs
new file mode 100644
index 000000000000..1d29cfc0f59a
--- /dev/null
+++ b/library/stdarch/crates/core_arch/src/x86/avxneconvert.rs
@@ -0,0 +1,188 @@
+use crate::core_arch::{simd::*, x86::*};
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
+/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit)
+/// floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_bcstnebf16_ps(a: *const u16) -> __m128 {
+    transmute(bcstnebf162ps_128(a))
+}
+
+/// Convert scalar BF16 (16-bit) floating point element stored at memory locations starting at location
+/// a to single precision (32-bit) floating-point, broadcast it to packed single precision (32-bit) floating-point
+/// elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_bcstnebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vbcstnebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_bcstnebf16_ps(a: *const u16) -> __m256 {
+    transmute(bcstnebf162ps_256(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneebf16_ps(a: *const __m128bh) -> __m128 {
+    transmute(cvtneebf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneebf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vcvtneebf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneebf16_ps(a: *const __m256bh) -> __m256 {
+    transmute(cvtneebf162ps_256(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm_cvtneobf16_ps(a: *const __m128bh) -> __m128 {
+    transmute(cvtneobf162ps_128(a))
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements stored at memory locations starting at
+/// location a to single precision (32-bit) floating-point elements, and store the results in dst.
+///
+/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_cvtneobf16_ps)
+#[inline]
+#[target_feature(enable = "avxneconvert")]
+#[cfg_attr(
+    all(test, any(target_os = "linux", target_env = "msvc")),
+    assert_instr(vcvtneobf162ps)
+)]
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub unsafe fn _mm256_cvtneobf16_ps(a: *const __m256bh) -> __m256 {
+    transmute(cvtneobf162ps_256(a))
+}
+
+#[allow(improper_ctypes)]
+extern "C" {
+    #[link_name = "llvm.x86.vbcstnebf162ps128"]
+    fn bcstnebf162ps_128(a: *const u16) -> f32x4;
+    #[link_name = "llvm.x86.vbcstnebf162ps256"]
+    fn bcstnebf162ps_256(a: *const u16) -> f32x8;
+
+    #[link_name = "llvm.x86.vcvtneebf162ps128"]
+    fn cvtneebf162ps_128(a: *const __m128bh) -> __m128;
+    #[link_name = "llvm.x86.vcvtneebf162ps256"]
+    fn cvtneebf162ps_256(a: *const __m256bh) -> __m256;
+
+    #[link_name = "llvm.x86.vcvtneobf162ps128"]
+    fn cvtneobf162ps_128(a: *const __m128bh) -> __m128;
+    #[link_name = "llvm.x86.vcvtneobf162ps256"]
+    fn cvtneobf162ps_256(a: *const __m256bh) -> __m256;
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core_arch::x86::*;
+    use std::ptr::addr_of;
+    use stdarch_test::simd_test;
+
+    const BF16_ONE: u16 = 0b0_01111111_0000000;
+    const BF16_TWO: u16 = 0b0_10000000_0000000;
+    const BF16_THREE: u16 = 0b0_10000000_1000000;
+    const BF16_FOUR: u16 = 0b0_10000001_0000000;
+    const BF16_FIVE: u16 = 0b0_10000001_0100000;
+    const BF16_SIX: u16 = 0b0_10000001_1000000;
+    const BF16_SEVEN: u16 = 0b0_10000001_1100000;
+    const BF16_EIGHT: u16 = 0b0_10000010_0000000;
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm_bcstnebf16_ps() {
+        let a = BF16_ONE;
+        let r = _mm_bcstnebf16_ps(addr_of!(a));
+        let e = _mm_set_ps(1., 1., 1., 1.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm256_bcstnebf16_ps() {
+        let a = BF16_ONE;
+        let r = _mm256_bcstnebf16_ps(addr_of!(a));
+        let e = _mm256_set_ps(1., 1., 1., 1., 1., 1., 1., 1.);
+        assert_eq_m256(r, e);
+    }
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm_cvtneebf16_ps() {
+        let a = __m128bh(
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+        );
+        let r = _mm_cvtneebf16_ps(addr_of!(a));
+        let e = _mm_setr_ps(1., 3., 5., 7.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm256_cvtneebf16_ps() {
+        let a = __m256bh(
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+        );
+        let r = _mm256_cvtneebf16_ps(addr_of!(a));
+        let e = _mm256_setr_ps(1., 3., 5., 7., 1., 3., 5., 7.);
+        assert_eq_m256(r, e);
+    }
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm_cvtneobf16_ps() {
+        let a = __m128bh(
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+        );
+        let r = _mm_cvtneobf16_ps(addr_of!(a));
+        let e = _mm_setr_ps(2., 4., 6., 8.);
+        assert_eq_m128(r, e);
+    }
+
+    #[simd_test(enable = "avxneconvert")]
+    unsafe fn test_mm256_cvtneobf16_ps() {
+        let a = __m256bh(
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+            BF16_ONE, BF16_TWO, BF16_THREE, BF16_FOUR, BF16_FIVE, BF16_SIX, BF16_SEVEN, BF16_EIGHT,
+        );
+        let r = _mm256_cvtneobf16_ps(addr_of!(a));
+        let e = _mm256_setr_ps(2., 4., 6., 8., 2., 4., 6., 8.);
+        assert_eq_m256(r, e);
+    }
+}
diff --git a/library/stdarch/crates/core_arch/src/x86/mod.rs b/library/stdarch/crates/core_arch/src/x86/mod.rs
index 8b1d3bbbb6af..6f8c51c16a98 100644
--- a/library/stdarch/crates/core_arch/src/x86/mod.rs
+++ b/library/stdarch/crates/core_arch/src/x86/mod.rs
@@ -894,6 +894,9 @@ fn as_m512bh(self) -> Self {
 pub use self::f16c::*;
 
 mod avx512bf16;
-
 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
 pub use self::avx512bf16::*;
+
+mod avxneconvert;
+#[unstable(feature = "stdarch_x86_avx512", issue = "111137")]
+pub use self::avxneconvert::*;
diff --git a/library/stdarch/crates/stdarch-test/src/lib.rs b/library/stdarch/crates/stdarch-test/src/lib.rs
index a8c2d36e113d..a2835e3b0c4e 100644
--- a/library/stdarch/crates/stdarch-test/src/lib.rs
+++ b/library/stdarch/crates/stdarch-test/src/lib.rs
@@ -84,7 +84,7 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) {
     // 2. It is a mark, indicating that the instruction will be
     // compiled into other instructions - mainly because of llvm
     // optimization.
-    let found = expected == "nop" || instrs.iter().any(|s| s.starts_with(expected));
+    let found = expected == "nop" || instrs.iter().any(|s| s.contains(expected));
 
     // Look for subroutine call instructions in the disassembly to detect whether
     // inlining failed: all intrinsics are `#[inline(always)]`, so calling one
diff --git a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
index 15d2454f4304..f12fa5f0eed6 100644
--- a/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
+++ b/library/stdarch/crates/stdarch-verify/tests/x86-intel.rs
@@ -758,7 +758,7 @@ fn equate(
         (&Type::ConstPtr(&Type::PrimSigned(8)), "char const*") => {}
         (&Type::ConstPtr(&Type::PrimSigned(32)), "__int32 const*" | "int const*") => {}
         (&Type::ConstPtr(&Type::PrimSigned(64)), "__int64 const*") => {}
-        (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*") => {}
+        (&Type::ConstPtr(&Type::PrimUnsigned(16)), "unsigned short const*" | "__bf16 const*") => {}
         (&Type::ConstPtr(&Type::PrimUnsigned(32)), "unsigned int const*") => {}
         (&Type::ConstPtr(&Type::PrimUnsigned(64)), "unsigned __int64 const*") => {}