Merge pull request #2026 from folkertdev/neon-vpadd

use `intrinsics::simd` for vpadd
This commit is contained in:
Folkert de Vries
2026-02-18 16:18:15 +00:00
committed by GitHub
3 changed files with 111 additions and 177 deletions
@@ -16025,14 +16025,11 @@ pub fn vpaddd_u64(a: uint64x2_t) -> u64 {
#[cfg(not(target_arch = "arm64ec"))]
#[cfg_attr(test, assert_instr(faddp))]
pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.faddp.v8f16"
)]
fn _vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
simd_add(even, odd)
}
unsafe { _vpaddq_f16(a, b) }
}
#[doc = "Floating-point add pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f32)"]
@@ -16041,14 +16038,11 @@ pub fn vpaddq_f16(a: float16x8_t, b: float16x8_t) -> float16x8_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(faddp))]
pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.faddp.v4f32"
)]
fn _vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
simd_add(even, odd)
}
unsafe { _vpaddq_f32(a, b) }
}
#[doc = "Floating-point add pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_f64)"]
@@ -16057,14 +16051,11 @@ pub fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(faddp))]
pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.faddp.v2f64"
)]
fn _vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
simd_add(even, odd)
}
unsafe { _vpaddq_f64(a, b) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s8)"]
@@ -16073,14 +16064,11 @@ pub fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.addp.v16i8"
)]
fn _vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
simd_add(even, odd)
}
unsafe { _vpaddq_s8(a, b) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s16)"]
@@ -16089,14 +16077,11 @@ pub fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.addp.v8i16"
)]
fn _vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
simd_add(even, odd)
}
unsafe { _vpaddq_s16(a, b) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s32)"]
@@ -16105,14 +16090,11 @@ pub fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.addp.v4i32"
)]
fn _vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
unsafe {
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
simd_add(even, odd)
}
unsafe { _vpaddq_s32(a, b) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_s64)"]
@@ -16121,119 +16103,62 @@ pub fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t {
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t {
unsafe extern "unadjusted" {
#[cfg_attr(
any(target_arch = "aarch64", target_arch = "arm64ec"),
link_name = "llvm.aarch64.neon.addp.v2i64"
)]
fn _vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
}
unsafe { _vpaddq_s64(a, b) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
#[inline(always)]
#[cfg(target_endian = "little")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
unsafe { transmute(vpaddq_s8(transmute(a), transmute(b))) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
#[inline(always)]
#[cfg(target_endian = "big")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
let a: uint8x16_t =
unsafe { simd_shuffle!(a, a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
let b: uint8x16_t =
unsafe { simd_shuffle!(b, b, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]) };
unsafe {
let ret_val: uint8x16_t = transmute(vpaddq_s8(transmute(a), transmute(b)));
simd_shuffle!(
ret_val,
ret_val,
[15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
)
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
simd_add(even, odd)
}
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u8)"]
#[inline(always)]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t {
    // Pairwise add expressed with generic SIMD operations: one shuffle picks
    // the lanes at the indices produced by `even::<16>()`, the other picks the
    // lanes at the indices produced by `odd::<16>()`, and the two gathered
    // vectors are then added lane by lane.
    unsafe {
        let lhs: uint8x16_t = simd_shuffle!(a, b, crate::core_arch::macros::even::<16>());
        let rhs: uint8x16_t = simd_shuffle!(a, b, crate::core_arch::macros::odd::<16>());
        simd_add(lhs, rhs)
    }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
#[inline(always)]
#[cfg(target_endian = "little")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
unsafe { transmute(vpaddq_s16(transmute(a), transmute(b))) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u16)"]
#[inline(always)]
#[cfg(target_endian = "big")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t {
let a: uint16x8_t = unsafe { simd_shuffle!(a, a, [7, 6, 5, 4, 3, 2, 1, 0]) };
let b: uint16x8_t = unsafe { simd_shuffle!(b, b, [7, 6, 5, 4, 3, 2, 1, 0]) };
unsafe {
let ret_val: uint16x8_t = transmute(vpaddq_s16(transmute(a), transmute(b)));
simd_shuffle!(ret_val, ret_val, [7, 6, 5, 4, 3, 2, 1, 0])
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<8>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<8>());
simd_add(even, odd)
}
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
#[inline(always)]
#[cfg(target_endian = "little")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
unsafe { transmute(vpaddq_s32(transmute(a), transmute(b))) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u32)"]
#[inline(always)]
#[cfg(target_endian = "big")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t {
let a: uint32x4_t = unsafe { simd_shuffle!(a, a, [3, 2, 1, 0]) };
let b: uint32x4_t = unsafe { simd_shuffle!(b, b, [3, 2, 1, 0]) };
unsafe {
let ret_val: uint32x4_t = transmute(vpaddq_s32(transmute(a), transmute(b)));
simd_shuffle!(ret_val, ret_val, [3, 2, 1, 0])
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<4>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<4>());
simd_add(even, odd)
}
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
#[inline(always)]
#[cfg(target_endian = "little")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
unsafe { transmute(vpaddq_s64(transmute(a), transmute(b))) }
}
#[doc = "Add Pairwise"]
#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vpaddq_u64)"]
#[inline(always)]
#[cfg(target_endian = "big")]
#[target_feature(enable = "neon")]
#[stable(feature = "neon_intrinsics", since = "1.59.0")]
#[cfg_attr(test, assert_instr(addp))]
pub fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
let a: uint64x2_t = unsafe { simd_shuffle!(a, a, [1, 0]) };
let b: uint64x2_t = unsafe { simd_shuffle!(b, b, [1, 0]) };
unsafe {
let ret_val: uint64x2_t = transmute(vpaddq_s64(transmute(a), transmute(b)));
simd_shuffle!(ret_val, ret_val, [1, 0])
let even = simd_shuffle!(a, b, crate::core_arch::macros::even::<2>());
let odd = simd_shuffle!(a, b, crate::core_arch::macros::odd::<2>());
simd_add(even, odd)
}
}
#[doc = "Floating-point add pairwise"]
+23 -1
View File
@@ -187,9 +187,31 @@ macro_rules! simd_masked_store {
};
}
/// Builds the array of the first `N` even indices, `[0, 2, 4, ...]`.
///
/// Element `k` of the result is `2 * k`. With `N == 0` the result is empty.
pub(crate) const fn even<const N: usize>() -> [u32; N] {
    let mut mask = [0u32; N];
    let mut lane = 0usize;
    // Iterators are not usable in a `const fn`, so fill the array by hand.
    loop {
        if lane >= N {
            break;
        }
        mask[lane] = (lane * 2) as u32;
        lane += 1;
    }
    mask
}
/// Builds the array of the first `N` odd indices, `[1, 3, 5, ...]`.
///
/// Element `k` of the result is `2 * k + 1`. With `N == 0` the result is empty.
pub(crate) const fn odd<const N: usize>() -> [u32; N] {
    let mut mask = [0u32; N];
    let mut lane = 0usize;
    // Iterators are not usable in a `const fn`, so fill the array by hand.
    loop {
        if lane >= N {
            break;
        }
        mask[lane] = (lane * 2 + 1) as u32;
        lane += 1;
    }
    mask
}
/// Multiples of N offset by K `[K, K+N, K+2N, ...]`.
pub(crate) const fn deinterleave_mask<const LANES: usize, const N: usize, const K: usize>()
-> [u32; LANES] {
// Produces: [K, K+N, K+2N, ...]
let mut out = [0u32; LANES];
let mut i = 0usize;
while i < LANES {
@@ -6962,28 +6962,29 @@ intrinsics:
- FnCall: [simd_shuffle!, [a, a, "{type[3]}"]]
- FnCall: ["vmovl{neon_type[0].noq}", [a]]
- name: "vpadd{neon_type.no}"
doc: Floating-point add pairwise
arguments: ["a: {neon_type}", "b: {neon_type}"]
return_type: "{type}"
- name: "vpadd{neon_type[0].no}"
doc: "Floating-point add pairwise"
arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
return_type: "{neon_type[0]}"
attr: [*neon-stable]
assert_instr: [faddp]
safety: safe
types:
- float32x4_t
- float64x2_t
- [float32x4_t, "4"]
- [float64x2_t, "2"]
compose:
- LLVMLink:
name: "faddp.{neon_type}"
links:
- link: "llvm.aarch64.neon.faddp.{neon_type}"
arch: aarch64,arm64ec
- Let:
- even
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
- Let:
- odd
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
- FnCall: [simd_add, [even, odd]]
- name: "vpadd{neon_type.no}"
- name: "vpadd{neon_type[0].no}"
doc: Floating-point add pairwise
arguments: ["a: {neon_type}", "b: {neon_type}"]
return_type: "{type}"
arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
return_type: "{neon_type[0]}"
attr:
- *neon-fp16
- *neon-stable-fp16
@@ -6991,14 +6992,15 @@ intrinsics:
assert_instr: [faddp]
safety: safe
types:
- float16x8_t
- [float16x8_t, "8"]
compose:
- LLVMLink:
name: "faddp.{neon_type}"
links:
- link: "llvm.aarch64.neon.faddp.{neon_type}"
arch: aarch64,arm64ec
- Let:
- even
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
- Let:
- odd
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
- FnCall: [simd_add, [even, odd]]
- name: "vpmax{neon_type.no}"
doc: Floating-point add pairwise
@@ -13236,26 +13238,6 @@ intrinsics:
- link: "llvm.aarch64.neon.usqadd.{neon_type[1]}"
arch: aarch64,arm64ec
- name: "vpadd{neon_type.no}"
doc: "Add Pairwise"
arguments: ["a: {neon_type}", "b: {neon_type}"]
return_type: "{neon_type}"
attr:
- *neon-stable
assert_instr: [addp]
safety: safe
types:
- int8x16_t
- int16x8_t
- int32x4_t
- int64x2_t
compose:
- LLVMLink:
name: "vpadd{neon_type.no}"
links:
- link: "llvm.aarch64.neon.addp.{neon_type}"
arch: aarch64,arm64ec
- name: "vpadd{neon_type[0].no}"
doc: "Add Pairwise"
arguments: ["a: {neon_type[0]}", "b: {neon_type[0]}"]
@@ -13265,17 +13247,22 @@ intrinsics:
assert_instr: [addp]
safety: safe
types:
- [uint8x16_t, int8x16_t]
- [uint16x8_t, int16x8_t]
- [uint32x4_t, int32x4_t]
- [uint64x2_t, int64x2_t]
- [int8x16_t, "16"]
- [int16x8_t, "8"]
- [int32x4_t, "4"]
- [int64x2_t, "2"]
- [uint8x16_t, "16"]
- [uint16x8_t, "8"]
- [uint32x4_t, "4"]
- [uint64x2_t, "2"]
compose:
- FnCall:
- transmute
- - FnCall:
- 'vpadd{neon_type[1].no}'
- - FnCall: [transmute, [a]]
- FnCall: [transmute, [b]]
- Let:
- even
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::even::<{type[1]}>()"]]
- Let:
- odd
- FnCall: ["simd_shuffle!", [a, b, "crate::core_arch::macros::odd::<{type[1]}>()"]]
- FnCall: [simd_add, [even, odd]]
- name: "vpaddd_s64"
doc: "Add pairwise"