Change implementation of v{add,sub}{l,w}_high

2026-05-31 21:47:15 +03:00 · 2026-05-09 04:39:21 +05:30
parent 401582dd27
commit 63d8cef02f
4 changed files with 85 additions and 128 deletions
@@ -23299,11 +23299,9 @@ pub fn vsubh_f16(a: f16, b: f16) -> f16 {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))]
 pub fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
    unsafe {
-        let c: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let d: int16x8_t = simd_cast(c);
-        let e: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let f: int16x8_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: int16x8_t = simd_cast(vget_high_s8(a));
+        let d: int16x8_t = simd_cast(vget_high_s8(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Signed Subtract Long"]
@@ -23314,11 +23312,9 @@ pub fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))]
 pub fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
    unsafe {
-        let c: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]);
-        let d: int32x4_t = simd_cast(c);
-        let e: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
-        let f: int32x4_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: int32x4_t = simd_cast(vget_high_s16(a));
+        let d: int32x4_t = simd_cast(vget_high_s16(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Signed Subtract Long"]
@@ -23329,11 +23325,9 @@ pub fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubl2))]
 pub fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
    unsafe {
-        let c: int32x2_t = simd_shuffle!(a, a, [2, 3]);
-        let d: int64x2_t = simd_cast(c);
-        let e: int32x2_t = simd_shuffle!(b, b, [2, 3]);
-        let f: int64x2_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: int64x2_t = simd_cast(vget_high_s32(a));
+        let d: int64x2_t = simd_cast(vget_high_s32(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Unsigned Subtract Long"]
@@ -23344,11 +23338,9 @@ pub fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))]
 pub fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
    unsafe {
-        let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let d: uint16x8_t = simd_cast(c);
-        let e: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let f: uint16x8_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: uint16x8_t = simd_cast(vget_high_u8(a));
+        let d: uint16x8_t = simd_cast(vget_high_u8(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Unsigned Subtract Long"]
@@ -23359,11 +23351,9 @@ pub fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))]
 pub fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
    unsafe {
-        let c: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]);
-        let d: uint32x4_t = simd_cast(c);
-        let e: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
-        let f: uint32x4_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: uint32x4_t = simd_cast(vget_high_u16(a));
+        let d: uint32x4_t = simd_cast(vget_high_u16(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Unsigned Subtract Long"]
@@ -23374,11 +23364,9 @@ pub fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubl2))]
 pub fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
    unsafe {
-        let c: uint32x2_t = simd_shuffle!(a, a, [2, 3]);
-        let d: uint64x2_t = simd_cast(c);
-        let e: uint32x2_t = simd_shuffle!(b, b, [2, 3]);
-        let f: uint64x2_t = simd_cast(e);
-        simd_sub(d, f)
+        let c: uint64x2_t = simd_cast(vget_high_u32(a));
+        let d: uint64x2_t = simd_cast(vget_high_u32(b));
+        simd_sub(c, d)
    }
 }
 #[doc = "Signed Subtract Wide"]
@@ -23388,10 +23376,8 @@ pub fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))]
 pub fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
-    unsafe {
-        let c: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_s8(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Signed Subtract Wide"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s16)"]
@@ -23400,10 +23386,8 @@ pub fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))]
 pub fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
-    unsafe {
-        let c: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_s16(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Signed Subtract Wide"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_s32)"]
@@ -23412,10 +23396,8 @@ pub fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(ssubw2))]
 pub fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
-    unsafe {
-        let c: int32x2_t = simd_shuffle!(b, b, [2, 3]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_s32(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Unsigned Subtract Wide"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u8)"]
@@ -23424,10 +23406,8 @@ pub fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))]
 pub fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
-    unsafe {
-        let c: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_u8(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Unsigned Subtract Wide"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u16)"]
@@ -23436,10 +23416,8 @@ pub fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))]
 pub fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
-    unsafe {
-        let c: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_u16(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Unsigned Subtract Wide"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vsubw_high_u32)"]
@@ -23448,10 +23426,8 @@ pub fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 #[cfg_attr(all(test, target_endian = "little"), assert_instr(usubw2))]
 pub fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
-    unsafe {
-        let c: uint32x2_t = simd_shuffle!(b, b, [2, 3]);
-        simd_sub(a, simd_cast(c))
-    }
+    let c = vget_high_u32(b);
+    unsafe { simd_sub(a, simd_cast(c)) }
 }
 #[doc = "Table look-up"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vtbl1_s8)"]
@@ -2499,9 +2499,9 @@ pub fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
+    let a: int16x4_t = vget_high_s16(a);
+    let b: int16x4_t = vget_high_s16(b);
    unsafe {
-        let a: int16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]);
-        let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
        let a: int32x4_t = simd_cast(a);
        let b: int32x4_t = simd_cast(b);
        simd_add(a, b)
@@ -2530,9 +2530,9 @@ pub fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
+    let a: int32x2_t = vget_high_s32(a);
+    let b: int32x2_t = vget_high_s32(b);
    unsafe {
-        let a: int32x2_t = simd_shuffle!(a, a, [2, 3]);
-        let b: int32x2_t = simd_shuffle!(b, b, [2, 3]);
        let a: int64x2_t = simd_cast(a);
        let b: int64x2_t = simd_cast(b);
        simd_add(a, b)
@@ -2561,9 +2561,9 @@ pub fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
+    let a: int8x8_t = vget_high_s8(a);
+    let b: int8x8_t = vget_high_s8(b);
    unsafe {
-        let a: int8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
        let a: int16x8_t = simd_cast(a);
        let b: int16x8_t = simd_cast(b);
        simd_add(a, b)
@@ -2592,9 +2592,9 @@ pub fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
+    let a: uint16x4_t = vget_high_u16(a);
+    let b: uint16x4_t = vget_high_u16(b);
    unsafe {
-        let a: uint16x4_t = simd_shuffle!(a, a, [4, 5, 6, 7]);
-        let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
        let a: uint32x4_t = simd_cast(a);
        let b: uint32x4_t = simd_cast(b);
        simd_add(a, b)
@@ -2623,9 +2623,9 @@ pub fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
+    let a: uint32x2_t = vget_high_u32(a);
+    let b: uint32x2_t = vget_high_u32(b);
    unsafe {
-        let a: uint32x2_t = simd_shuffle!(a, a, [2, 3]);
-        let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]);
        let a: uint64x2_t = simd_cast(a);
        let b: uint64x2_t = simd_cast(b);
        simd_add(a, b)
@@ -2654,9 +2654,9 @@ pub fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
+    let a: uint8x8_t = vget_high_u8(a);
+    let b: uint8x8_t = vget_high_u8(b);
    unsafe {
-        let a: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
-        let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
        let a: uint16x8_t = simd_cast(a);
        let b: uint16x8_t = simd_cast(b);
        simd_add(a, b)
@@ -2856,8 +2856,8 @@ pub fn vaddq_p128(a: p128, b: p128) -> p128 {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
+    let b = vget_high_s16(b);
    unsafe {
-        let b: int16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
        let b: int32x4_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -2885,8 +2885,8 @@ pub fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
+    let b = vget_high_s32(b);
    unsafe {
-        let b: int32x2_t = simd_shuffle!(b, b, [2, 3]);
        let b: int64x2_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -2914,8 +2914,8 @@ pub fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
+    let b = vget_high_s8(b);
    unsafe {
-        let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
        let b: int16x8_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -2943,8 +2943,8 @@ pub fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
+    let b = vget_high_u16(b);
    unsafe {
-        let b: uint16x4_t = simd_shuffle!(b, b, [4, 5, 6, 7]);
        let b: uint32x4_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -2972,8 +2972,8 @@ pub fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
+    let b = vget_high_u32(b);
    unsafe {
-        let b: uint32x2_t = simd_shuffle!(b, b, [2, 3]);
        let b: uint64x2_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -3001,8 +3001,8 @@ pub fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t {
    unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t {
+    let b = vget_high_u8(b);
    unsafe {
-        let b: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
        let b: uint16x8_t = simd_cast(b);
        simd_add(a, b)
    }
@@ -5667,14 +5667,13 @@ intrinsics:
      - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [ssubw2]]}]]
    safety: safe
    types:
-      - [int16x8_t, int8x16_t, int8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - [int32x4_t, int16x8_t, int16x4_t, '[4, 5, 6, 7]']
-      - [int64x2_t, int32x4_t, int32x2_t, '[2, 3]']
+      - [int16x8_t, int8x16_t]
+      - [int32x4_t, int16x8_t]
+      - [int64x2_t, int32x4_t]
    compose:
      - Let:
          - c
-          - "{neon_type[2]}"
-          - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]
+          - FnCall: ['vget_high_{neon_type[1]}', [b]]
      - FnCall:
          - simd_sub
          - - a
@@ -5689,14 +5688,13 @@ intrinsics:
      - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [usubw2]]}]]
    safety: safe
    types:
-      - [uint16x8_t, uint8x16_t, uint8x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - [uint32x4_t, uint16x8_t, uint16x4_t, '[4, 5, 6, 7]']
-      - [uint64x2_t, uint32x4_t, uint32x2_t, '[2, 3]']
+      - [uint16x8_t, uint8x16_t]
+      - [uint32x4_t, uint16x8_t]
+      - [uint64x2_t, uint32x4_t]
    compose:
      - Let:
          - c
-          - "{neon_type[2]}"
-          - FnCall: [simd_shuffle!, [b, b, "{type[3]}"]]
+          - FnCall: ['vget_high_{neon_type[1]}', [b]]
      - FnCall:
          - simd_sub
          - - a
@@ -5711,27 +5709,19 @@ intrinsics:
      - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [ssubl2]]}]]
    safety: safe
    types:
-      - [int8x16_t, int16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', int8x8_t]
-      - [int16x8_t, int32x4_t, '[4, 5, 6, 7]', int16x4_t]
-      - [int32x4_t, int64x2_t, '[2, 3]', int32x2_t]
+      - [int8x16_t, int16x8_t]
+      - [int16x8_t, int32x4_t]
+      - [int32x4_t, int64x2_t]
    compose:
      - Let:
          - c
-          - "{neon_type[3]}"
-          - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]]
+          - "{neon_type[1]}"
+          - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [a]]}]]
      - Let:
          - d
          - "{neon_type[1]}"
-          - FnCall: [simd_cast, [c]]
-      - Let:
-          - e
-          - "{neon_type[3]}"
-          - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]]
-      - Let:
-          - f
-          - "{neon_type[1]}"
-          - FnCall: [simd_cast, [e]]
-      - FnCall: [simd_sub, [d, f]]
+          - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [b]]}]]
+      - FnCall: [simd_sub, [c, d]]

  - name: "vsubl_high{neon_type[0].noq}"
    doc: "Unsigned Subtract Long"
@@ -5742,27 +5732,19 @@ intrinsics:
      - FnCall: [cfg_attr, [*all-test-little-endian, {FnCall: [assert_instr, [usubl2]]}]]
    safety: safe
    types:
-      - [uint8x16_t, uint16x8_t, '[8, 9, 10, 11, 12, 13, 14, 15]', uint8x8_t]
-      - [uint16x8_t, uint32x4_t, '[4, 5, 6, 7]', uint16x4_t]
-      - [uint32x4_t, uint64x2_t, '[2, 3]', uint32x2_t]
+      - [uint8x16_t, uint16x8_t]
+      - [uint16x8_t, uint32x4_t]
+      - [uint32x4_t, uint64x2_t]
    compose:
      - Let:
          - c
-          - "{neon_type[3]}"
-          - FnCall: [simd_shuffle!, [a, a, "{type[2]}"]]
+          - "{neon_type[1]}"
+          - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [a]]}]]
      - Let:
          - d
          - "{neon_type[1]}"
-          - FnCall: [simd_cast, [c]]
-      - Let:
-          - e
-          - "{neon_type[3]}"
-          - FnCall: [simd_shuffle!, [b, b, "{type[2]}"]]
-      - Let:
-          - f
-          - "{neon_type[1]}"
-          - FnCall: [simd_cast, [e]]
-      - FnCall: [simd_sub, [d, f]]
+          - FnCall: [simd_cast, [{FnCall: ['vget_high_{neon_type[0]}', [b]]}]]
+      - FnCall: [simd_sub, [c, d]]

  - name: "vbcax{neon_type.no}"
    doc: Bit clear and exclusive OR
@@ -14478,21 +14478,21 @@ intrinsics:
      - *neon-cfg-arm-unstable
    safety: safe
    types:
-      - ['vaddl_high_s8', 'int8x16_t', 'int16x8_t', 'vaddl', 'saddl2',  'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - ['vaddl_high_s16', 'int16x8_t', 'int32x4_t', 'vaddl', 'saddl2',  'int16x4_t', '[4, 5, 6, 7]']
-      - ['vaddl_high_s32', 'int32x4_t', 'int64x2_t', 'vaddl', 'saddl2',  'int32x2_t', '[2, 3]']
-      - ['vaddl_high_u8', 'uint8x16_t', 'uint16x8_t', 'vaddl', 'uaddl2',  'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - ['vaddl_high_u16', 'uint16x8_t', 'uint32x4_t', 'vaddl', 'uaddl2',  'uint16x4_t', '[4, 5, 6, 7]']
-      - ['vaddl_high_u32', 'uint32x4_t', 'uint64x2_t', 'vaddl', 'uaddl2',  'uint32x2_t', '[2, 3]']
+      - ['vaddl_high_s8', 'int8x16_t', 'int16x8_t', 'vaddl', 'saddl2',  'int8x8_t']
+      - ['vaddl_high_s16', 'int16x8_t', 'int32x4_t', 'vaddl', 'saddl2',  'int16x4_t']
+      - ['vaddl_high_s32', 'int32x4_t', 'int64x2_t', 'vaddl', 'saddl2',  'int32x2_t']
+      - ['vaddl_high_u8', 'uint8x16_t', 'uint16x8_t', 'vaddl', 'uaddl2',  'uint8x8_t']
+      - ['vaddl_high_u16', 'uint16x8_t', 'uint32x4_t', 'vaddl', 'uaddl2',  'uint16x4_t']
+      - ['vaddl_high_u32', 'uint32x4_t', 'uint64x2_t', 'vaddl', 'uaddl2',  'uint32x2_t']
    compose:
      - Let:
          - a
          - '{neon_type[5]}'
-          - FnCall: ['simd_shuffle!', [a, a, '{type[6]}']]
+          - FnCall: ['vget_high_{neon_type[1]}', [a]]
      - Let:
          - b
          - '{neon_type[5]}'
-          - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']]
+          - FnCall: ['vget_high_{neon_type[1]}', [b]]
      - Let: [a, '{neon_type[2]}', {FnCall: [simd_cast, [a]]}]
      - Let: [b, '{neon_type[2]}', {FnCall: [simd_cast, [b]]}]
      - FnCall: [simd_add, [a, b]]
@@ -14534,17 +14534,16 @@ intrinsics:
      - *neon-cfg-arm-unstable
    safety: safe
    types:
-      - ['vaddw_high_s8', 'int16x8_t', 'int8x16_t', 'vaddw', 'saddw2', 'int8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - ['vaddw_high_s16', 'int32x4_t', 'int16x8_t', 'vaddw', 'saddw2', 'int16x4_t', '[4, 5, 6, 7]']
-      - ['vaddw_high_s32', 'int64x2_t', 'int32x4_t', 'vaddw', 'saddw2', 'int32x2_t', '[2, 3]']
-      - ['vaddw_high_u8', 'uint16x8_t', 'uint8x16_t', 'vaddw', 'uaddw2', 'uint8x8_t', '[8, 9, 10, 11, 12, 13, 14, 15]']
-      - ['vaddw_high_u16', 'uint32x4_t', 'uint16x8_t', 'vaddw', 'uaddw2', 'uint16x4_t', '[4, 5, 6, 7]']
-      - ['vaddw_high_u32', 'uint64x2_t', 'uint32x4_t', 'vaddw', 'uaddw2', 'uint32x2_t', '[2, 3]']
+      - ['vaddw_high_s8', 'int16x8_t', 'int8x16_t', 'vaddw', 'saddw2', 'int8x8_t']
+      - ['vaddw_high_s16', 'int32x4_t', 'int16x8_t', 'vaddw', 'saddw2', 'int16x4_t']
+      - ['vaddw_high_s32', 'int64x2_t', 'int32x4_t', 'vaddw', 'saddw2', 'int32x2_t']
+      - ['vaddw_high_u8', 'uint16x8_t', 'uint8x16_t', 'vaddw', 'uaddw2', 'uint8x8_t']
+      - ['vaddw_high_u16', 'uint32x4_t', 'uint16x8_t', 'vaddw', 'uaddw2', 'uint16x4_t']
+      - ['vaddw_high_u32', 'uint64x2_t', 'uint32x4_t', 'vaddw', 'uaddw2', 'uint32x2_t']
    compose:
      - Let:
          - b
-          - '{neon_type[5]}'
-          - FnCall: ['simd_shuffle!', [b, b, '{type[6]}']]
+          - FnCall: ['vget_high_{neon_type[2]}', [b]]
      - Let:
          - b
          - '{neon_type[1]}'