Merge pull request #2026 from folkertdev/neon-vpadd

use `intrinsics::simd` for vpadd
2026-04-27 18:57:42 +03:00 · 2026-02-18 16:18:15 +00:00
parent 9ce60760aa 6671f95770
commit 244fc326f2
3 changed files with 111 additions and 177 deletions
@@ -16025,14 +16025,11 @@ pub fn vpaddd_u64(a: uint64x2_t) -> u64 {
 #[cfg(not(target_arch = "arm64ec"))]
 #[cfg_attr(test, assert_instr(faddp))]
 pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.faddp.v8f16"
-        )]
-        fn _vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_f16(a, b) }
 }
 #[doc = "Floating-point add pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f32)"]
@@ -16041,14 +16038,11 @@ pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(faddp))]
 pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.faddp.v4f32"
-        )]
-        fn _vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_f32(a, b) }
 }
 #[doc = "Floating-point add pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f64)"]
@@ -16057,14 +16051,11 @@ pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(faddp))]
 pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.faddp.v2f64"
-        )]
-        fn _vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_f64(a, b) }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s8)"]
@@ -16073,14 +16064,11 @@ pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.addp.v16i8"
-        )]
-        fn _vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_s8(a, b) }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s16)"]
@@ -16089,14 +16077,11 @@ pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.addp.v8i16"
-        )]
-        fn _vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_s16(a, b) }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s32)"]
@@ -16105,14 +16090,11 @@ pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.addp.v4i32"
-        )]
-        fn _vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
+        simd_add(even, odd)
    }
-    unsafe { _vpaddq_s32(a, b) }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s64)"]
@@ -16121,119 +16103,62 @@ pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
-    unsafe extern "unadjusted" {
-        #[cfg_attr(
-            any(target_arch = "aarch64", target_arch = "arm64ec"),
-            link_name = "llvm.aarch64.neon.addp.v2i64"
-        )]
-        fn _vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
-    }
-    unsafe { _vpaddq_s64(a, b) }
-}
-#[doc = "Add Pairwise"]
-#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
-#[inline(always)]
-#[cfg(target_endian = "little")]
-#[target_feature(enable = "neon")]
-#[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(addp))]
-pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    unsafe { transmute(vpaddq_s8(transmute(a), transmute(b))) }
-}
-#[doc = "Add Pairwise"]
-#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
-#[inline(always)]
-#[cfg(target_endian = "big")]
-#[target_feature(enable = "neon")]
-#[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(addp))]
-pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
-    let a: uint8x16_t =
-        unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
-    let b: uint8x16_t =
-        unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
    unsafe {
-        let ret_val: uint8x16_t = transmute(vpaddq_s8(transmute(a), transmute(b)));
-        simd_shuffle!(
-            ret_val,
-            ret_val,
-            [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
-        )
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
+        simd_add(even, odd)
+    }
+}
+#[doc = "Add Pairwise"]
+#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
+#[inline(always)]
+#[target_feature(enable = "neon")]
+#[stable(feature = "neon_intrinsics", since = "1.59.0")]
+#[cfg_attr(test, assert_instr(addp))]
+pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
+    unsafe {
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
+        simd_add(even, odd)
    }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
 #[inline(always)]
-#[cfg(target_endian = "little")]
 #[target_feature(enable = "neon")]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    unsafe { transmute(vpaddq_s16(transmute(a), transmute(b))) }
-}
-#[doc = "Add Pairwise"]
-#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
-#[inline(always)]
-#[cfg(target_endian = "big")]
-#[target_feature(enable = "neon")]
-#[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(addp))]
-pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
-    let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) };
-    let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) };
    unsafe {
-        let ret_val: uint16x8_t = transmute(vpaddq_s16(transmute(a), transmute(b)));
-        simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0])
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
+        simd_add(even, odd)
    }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
 #[inline(always)]
-#[cfg(target_endian = "little")]
 #[target_feature(enable = "neon")]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    unsafe { transmute(vpaddq_s32(transmute(a), transmute(b))) }
-}
-#[doc = "Add Pairwise"]
-#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
-#[inline(always)]
-#[cfg(target_endian = "big")]
-#[target_feature(enable = "neon")]
-#[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(addp))]
-pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
-    let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) };
-    let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) };
    unsafe {
-        let ret_val: uint32x4_t = transmute(vpaddq_s32(transmute(a), transmute(b)));
-        simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0])
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
+        simd_add(even, odd)
    }
 }
 #[doc = "Add Pairwise"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
 #[inline(always)]
-#[cfg(target_endian = "little")]
 #[target_feature(enable = "neon")]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(test, assert_instr(addp))]
 pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    unsafe { transmute(vpaddq_s64(transmute(a), transmute(b))) }
-}
-#[doc = "Add Pairwise"]
-#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
-#[inline(always)]
-#[cfg(target_endian = "big")]
-#[target_feature(enable = "neon")]
-#[stable(feature = "neon_intrinsics", since = "1.59.0")]
-#[cfg_attr(test, assert_instr(addp))]
-pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
-    let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) };
-    let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) };
    unsafe {
-        let ret_val: uint64x2_t = transmute(vpaddq_s64(transmute(a), transmute(b)));
-        simd_shuffle!(ret_val, ret_val, [1, 0])
+        let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
+        let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
+        simd_add(even, odd)
    }
 }
 #[doc = "Floating-point add pairwise"]
@@ -187,9 +187,31 @@ macro_rules! simd_masked_store {
    };
 }

+/// The first N even indices `[0, 2, 4, ...]`.
+pub(crate) const fn even<const N: usize>() -> [u32; N] {
+    let mut out = [0u32; N];
+    let mut i = 0usize;
+    while i < N {
+        out[i] = (2 * i) as u32;
+        i += 1;
+    }
+    out
+}
+
+/// The first N odd indices `[1, 3, 5, ...]`.
+pub(crate) const fn odd<const N: usize>() -> [u32; N] {
+    let mut out = [0u32; N];
+    let mut i = 0usize;
+    while i < N {
+        out[i] = (2 * i + 1) as u32;
+        i += 1;
+    }
+    out
+}
+
+/// Multiples of N offset by K `[K, K+N, K+2N, ...]`.
 pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>()
 -> [u32; LANES] {
-    // Produces: [K, K+N, K+2N, ...]
    let mut out = [0u32; LANES];
    let mut i = 0usize;
    while i < LANES {
@@ -6962,28 +6962,29 @@ intrinsics:
          - FnCall: [simd_shuffle!, [a, a, "{type[3]}"]]
      - FnCall: ["vmovl{neon_type[0].noq}", [a]]

-  - name: "vpadd{neon_type.no}"
-    doc: Floating-point add pairwise
-    arguments: ["a: {neon_type}", "b: {neon_type}"]
-    return_type: "{type}"
+  - name: "vpadd{neon_type[0].no}"
+    doc: "Floating-point add pairwise"
+    arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
+    return_type: "{neon_type[0]}"
    attr: [*neon-stable]
    assert_instr: [faddp]
    safety: safe
    types:
-      - float32x4_t
-      - float64x2_t
+      - [float32x4_t, "4"]
+      - [float64x2_t, "2"]
    compose:
-      - LLVMLink:
-          name: "faddp.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.faddp.{neon_type}"
-              arch: aarch64,arm64ec
+      - Let:
+        - even
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
+      - Let:
+        - odd
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
+      - FnCall: [simd_add, [even, odd]]

-
-  - name: "vpadd{neon_type.no}"
+  - name: "vpadd{neon_type[0].no}"
    doc: Floating-point add pairwise
-    arguments: ["a: {neon_type}", "b: {neon_type}"]
-    return_type: "{type}"
+    arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
+    return_type: "{neon_type[0]}"
    attr:
      - *neon-fp16
      - *neon-stable-fp16
@@ -6991,14 +6992,15 @@ intrinsics:
    assert_instr: [faddp]
    safety: safe
    types:
-      - float16x8_t
+      - [float16x8_t, "8"]
    compose:
-      - LLVMLink:
-          name: "faddp.{neon_type}"
-          links:
-            - link: "llvm.aarch64.neon.faddp.{neon_type}"
-              arch: aarch64,arm64ec
-
+      - Let:
+        - even
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
+      - Let:
+        - odd
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
+      - FnCall: [simd_add, [even, odd]]

  - name: "vpmax{neon_type.no}"
    doc: Floating-point add pairwise
@@ -13236,26 +13238,6 @@ intrinsics:
            - link: "llvm.aarch64.neon.usqadd.{neon_type[1]}"
              arch: aarch64,arm64ec

-  - name: "vpadd{neon_type.no}"
-    doc: "Add Pairwise"
-    arguments: ["a: {neon_type}", "b: {neon_type}"]
-    return_type: "{neon_type}"
-    attr:
-      - *neon-stable
-    assert_instr: [addp]
-    safety: safe
-    types:
-      - int8x16_t
-      - int16x8_t
-      - int32x4_t
-      - int64x2_t
-    compose:
-      - LLVMLink:
-          name: "vpadd{neon_type.no}"
-          links:
-            - link: "llvm.aarch64.neon.addp.{neon_type}"
-              arch: aarch64,arm64ec
-
  - name: "vpadd{neon_type[0].no}"
    doc: "Add Pairwise"
    arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
@@ -13265,17 +13247,22 @@ intrinsics:
    assert_instr: [addp]
    safety: safe
    types:
-      - [uint8x16_t, int8x16_t]
-      - [uint16x8_t, int16x8_t]
-      - [uint32x4_t, int32x4_t]
-      - [uint64x2_t, int64x2_t]
+      - [int8x16_t, "16"]
+      - [int16x8_t, "8"]
+      - [int32x4_t, "4"]
+      - [int64x2_t, "2"]
+      - [uint8x16_t, "16"]
+      - [uint16x8_t, "8"]
+      - [uint32x4_t, "4"]
+      - [uint64x2_t, "2"]
    compose:
-      - FnCall:
-          - transmute
-          - - FnCall:
-                - 'vpadd{neon_type[1].no}'
-                - - FnCall: [transmute, [a]]
-                  - FnCall: [transmute, [b]]
+      - Let:
+        - even
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
+      - Let:
+        - odd
+        - FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
+      - FnCall: [simd_add, [even, odd]]

  - name: "vpaddd_s64"
    doc: "Add pairwise"