gen-arm: fix vmlal, vmlsl and vmull high_lane variants via vdupq_lane

2026-06-01 14:10:03 +03:00 · 2026-05-01 01:11:01 +05:30
parent 1964b04a31
commit a023ebf71f
2 changed files with 54 additions and 54 deletions
@@ -14520,7 +14520,7 @@ pub fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlal_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlal_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlal_high_s16(a, b, vdupq_lane_s16::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s16)"]
@@ -14535,7 +14535,7 @@ pub fn vmlal_high_laneq_s16<const LANE: i32>(
    c: int16x8_t,
 ) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmlal_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlal_high_s16(a, b, vdupq_laneq_s16::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_s32)"]
@@ -14546,7 +14546,7 @@ pub fn vmlal_high_laneq_s16<const LANE: i32>(
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlal_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmlal_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlal_high_s32(a, b, vdupq_lane_s32::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_s32)"]
@@ -14561,7 +14561,7 @@ pub fn vmlal_high_laneq_s32<const LANE: i32>(
    c: int32x4_t,
 ) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlal_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlal_high_s32(a, b, vdupq_laneq_s32::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u16)"]
@@ -14576,7 +14576,7 @@ pub fn vmlal_high_lane_u16<const LANE: i32>(
    c: uint16x4_t,
 ) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlal_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlal_high_u16(a, b, vdupq_lane_u16::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u16)"]
@@ -14591,7 +14591,7 @@ pub fn vmlal_high_laneq_u16<const LANE: i32>(
    c: uint16x8_t,
 ) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmlal_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlal_high_u16(a, b, vdupq_laneq_u16::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_lane_u32)"]
@@ -14606,7 +14606,7 @@ pub fn vmlal_high_lane_u32<const LANE: i32>(
    c: uint32x2_t,
 ) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmlal_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlal_high_u32(a, b, vdupq_lane_u32::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_laneq_u32)"]
@@ -14621,7 +14621,7 @@ pub fn vmlal_high_laneq_u32<const LANE: i32>(
    c: uint32x4_t,
 ) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlal_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlal_high_u32(a, b, vdupq_laneq_u32::<LANE>(c))
 }
 #[doc = "Multiply-add long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlal_high_n_s16)"]
@@ -14764,7 +14764,7 @@ pub fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlsl_high_lane_s16<const LANE: i32>(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlsl_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlsl_high_s16(a, b, vdupq_lane_s16::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s16)"]
@@ -14779,7 +14779,7 @@ pub fn vmlsl_high_laneq_s16<const LANE: i32>(
    c: int16x8_t,
 ) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmlsl_high_s16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlsl_high_s16(a, b, vdupq_laneq_s16::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_s32)"]
@@ -14790,7 +14790,7 @@ pub fn vmlsl_high_laneq_s16<const LANE: i32>(
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmlsl_high_lane_s32<const LANE: i32>(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmlsl_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlsl_high_s32(a, b, vdupq_lane_s32::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_s32)"]
@@ -14805,7 +14805,7 @@ pub fn vmlsl_high_laneq_s32<const LANE: i32>(
    c: int32x4_t,
 ) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlsl_high_s32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlsl_high_s32(a, b, vdupq_laneq_s32::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u16)"]
@@ -14820,7 +14820,7 @@ pub fn vmlsl_high_lane_u16<const LANE: i32>(
    c: uint16x4_t,
 ) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlsl_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlsl_high_u16(a, b, vdupq_lane_u16::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u16)"]
@@ -14835,7 +14835,7 @@ pub fn vmlsl_high_laneq_u16<const LANE: i32>(
    c: uint16x8_t,
 ) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmlsl_high_u16(a, b, simd_shuffle!(c, c, [LANE as u32; 8])) }
+    vmlsl_high_u16(a, b, vdupq_laneq_u16::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_lane_u32)"]
@@ -14850,7 +14850,7 @@ pub fn vmlsl_high_lane_u32<const LANE: i32>(
    c: uint32x2_t,
 ) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmlsl_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlsl_high_u32(a, b, vdupq_lane_u32::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_laneq_u32)"]
@@ -14865,7 +14865,7 @@ pub fn vmlsl_high_laneq_u32<const LANE: i32>(
    c: uint32x4_t,
 ) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmlsl_high_u32(a, b, simd_shuffle!(c, c, [LANE as u32; 4])) }
+    vmlsl_high_u32(a, b, vdupq_laneq_u32::<LANE>(c))
 }
 #[doc = "Multiply-subtract long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmlsl_high_n_s16)"]
@@ -15270,7 +15270,7 @@ pub fn vmulh_laneq_f16<const LANE: i32>(a: f16, b: float16x8_t) -> f16 {
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmull_high_s16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
+    vmull_high_s16(a, vdupq_lane_s16::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s16)"]
@@ -15281,7 +15281,7 @@ pub fn vmull_high_lane_s16<const LANE: i32>(a: int16x8_t, b: int16x4_t) -> int32
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmull_high_s16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
+    vmull_high_s16(a, vdupq_laneq_s16::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_s32)"]
@@ -15292,7 +15292,7 @@ pub fn vmull_high_laneq_s16<const LANE: i32>(a: int16x8_t, b: int16x8_t) -> int3
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmull_high_s32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
+    vmull_high_s32(a, vdupq_lane_s32::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_s32)"]
@@ -15303,7 +15303,7 @@ pub fn vmull_high_lane_s32<const LANE: i32>(a: int32x4_t, b: int32x2_t) -> int64
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmull_high_s32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
+    vmull_high_s32(a, vdupq_laneq_s32::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u16)"]
@@ -15314,7 +15314,7 @@ pub fn vmull_high_laneq_s32<const LANE: i32>(a: int32x4_t, b: int32x4_t) -> int6
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmull_high_u16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
+    vmull_high_u16(a, vdupq_lane_u16::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u16)"]
@@ -15325,7 +15325,7 @@ pub fn vmull_high_lane_u16<const LANE: i32>(a: uint16x8_t, b: uint16x4_t) -> uin
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t {
    static_assert_uimm_bits!(LANE, 3);
-    unsafe { vmull_high_u16(a, simd_shuffle!(b, b, [LANE as u32; 8])) }
+    vmull_high_u16(a, vdupq_laneq_u16::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_lane_u32)"]
@@ -15336,7 +15336,7 @@ pub fn vmull_high_laneq_u16<const LANE: i32>(a: uint16x8_t, b: uint16x8_t) -> ui
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 1);
-    unsafe { vmull_high_u32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
+    vmull_high_u32(a, vdupq_lane_u32::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_laneq_u32)"]
@@ -15347,7 +15347,7 @@ pub fn vmull_high_lane_u32<const LANE: i32>(a: uint32x4_t, b: uint32x2_t) -> uin
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub fn vmull_high_laneq_u32<const LANE: i32>(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t {
    static_assert_uimm_bits!(LANE, 2);
-    unsafe { vmull_high_u32(a, simd_shuffle!(b, b, [LANE as u32; 4])) }
+    vmull_high_u32(a, vdupq_laneq_u32::<LANE>(b))
 }
 #[doc = "Multiply long"]
 #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vmull_high_n_s16)"]
@@ -11419,17 +11419,17 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
-      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
-      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
-      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
+      - [int32x4_t, int16x8_t, int16x4_t, '2']
+      - [int32x4_t, int16x8_t, int16x8_t, '3']
+      - [int64x2_t, int32x4_t, int32x2_t, '1']
+      - [int64x2_t, int32x4_t, int32x4_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
      - FnCall:
          - "vmlsl_high_{neon_type[1]}"
          - - a
            - b
-            - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]]
+            - FnCall: ['vdupq_lane{neon_type[2].no}', [c], [LANE]]

  - name: "vmlsl_high_lane{neon_type[2].no}"
    doc: "Multiply-subtract long"
@@ -11442,17 +11442,17 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32; 8]']
-      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32; 8]']
-      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32; 4]']
-      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32; 4]']
+      - [uint32x4_t, uint16x8_t, uint16x4_t, '2']
+      - [uint32x4_t, uint16x8_t, uint16x8_t, '3']
+      - [uint64x2_t, uint32x4_t, uint32x2_t, '1']
+      - [uint64x2_t, uint32x4_t, uint32x4_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
      - FnCall:
          - "vmlsl_high_{neon_type[1]}"
          - - a
            - b
-            - FnCall: [simd_shuffle!, [c, c, "{type[4]}"]]
+            - FnCall: ['vdupq_lane{neon_type[2].no}', [c], [LANE]]

  - name: "vclt{neon_type[0].no}"
    doc: "Floating-point compare less than"
@@ -11830,16 +11830,16 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [int16x8_t, int16x4_t, int32x4_t, '2', '[LANE as u32; 8]']
-      - [int16x8_t, int16x8_t, int32x4_t, '3', '[LANE as u32; 8]']
-      - [int32x4_t, int32x2_t, int64x2_t, '1', '[LANE as u32; 4]']
-      - [int32x4_t, int32x4_t, int64x2_t, '2', '[LANE as u32; 4]']
+      - [int16x8_t, int16x4_t, int32x4_t, '2']
+      - [int16x8_t, int16x8_t, int32x4_t, '3']
+      - [int32x4_t, int32x2_t, int64x2_t, '1']
+      - [int32x4_t, int32x4_t, int64x2_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]]
      - FnCall:
          - "vmull_high_{neon_type[0]}"
          - - a
-            - FnCall: [simd_shuffle!, [b, b, '{type[4]}']]
+            - FnCall: ['vdupq_lane{neon_type[1].no}', [b], [LANE]]

  - name: "vmull_high_lane{neon_type[1].no}"
    doc: "Multiply long"
@@ -11852,16 +11852,16 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [uint16x8_t, uint16x4_t, uint32x4_t, '2', '[LANE as u32; 8]']
-      - [uint16x8_t, uint16x8_t, uint32x4_t, '3', '[LANE as u32; 8]']
-      - [uint32x4_t, uint32x2_t, uint64x2_t, '1', '[LANE as u32; 4]']
-      - [uint32x4_t, uint32x4_t, uint64x2_t, '2', '[LANE as u32; 4]']
+      - [uint16x8_t, uint16x4_t, uint32x4_t, '2']
+      - [uint16x8_t, uint16x8_t, uint32x4_t, '3']
+      - [uint32x4_t, uint32x2_t, uint64x2_t, '1']
+      - [uint32x4_t, uint32x4_t, uint64x2_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, "{type[3]}"]]
      - FnCall:
          - "vmull_high_{neon_type[0]}"
          - - a
-            - FnCall: [simd_shuffle!, [b, b, '{type[4]}']]
+            - FnCall: ['vdupq_lane{neon_type[1].no}', [b], [LANE]]

  - name: "vrsqrte{neon_type.no}"
    doc: "Reciprocal square-root estimate."
@@ -12143,13 +12143,13 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [int32x4_t, int16x8_t, int16x4_t, '2', '[LANE as u32; 8]']
-      - [int32x4_t, int16x8_t, int16x8_t, '3', '[LANE as u32; 8]']
-      - [int64x2_t, int32x4_t, int32x2_t, '1', '[LANE as u32; 4]']
-      - [int64x2_t, int32x4_t, int32x4_t, '2', '[LANE as u32; 4]']
+      - [int32x4_t, int16x8_t, int16x4_t, '2']
+      - [int32x4_t, int16x8_t, int16x8_t, '3']
+      - [int64x2_t, int32x4_t, int32x2_t, '1']
+      - [int64x2_t, int32x4_t, int32x4_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
-      - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]]
+      - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: ['vdupq_lane{neon_type[2].no}', [c], [LANE]]}]]

  - name: "vmlal_high_lane{neon_type[2].no}"
    doc: "Multiply-add long"
@@ -12162,13 +12162,13 @@ intrinsics:
    static_defs: ['const LANE: i32']
    safety: safe
    types:
-      - [uint32x4_t, uint16x8_t, uint16x4_t, '2', '[LANE as u32; 8]']
-      - [uint32x4_t, uint16x8_t, uint16x8_t, '3', '[LANE as u32; 8]']
-      - [uint64x2_t, uint32x4_t, uint32x2_t, '1', '[LANE as u32; 4]']
-      - [uint64x2_t, uint32x4_t, uint32x4_t, '2', '[LANE as u32; 4]']
+      - [uint32x4_t, uint16x8_t, uint16x4_t, '2']
+      - [uint32x4_t, uint16x8_t, uint16x8_t, '3']
+      - [uint64x2_t, uint32x4_t, uint32x2_t, '1']
+      - [uint64x2_t, uint32x4_t, uint32x4_t, '2']
    compose:
      - FnCall: [static_assert_uimm_bits!, [LANE, '{type[3]}']]
-      - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: [simd_shuffle!, [c, c, '{type[4]}']]}]]
+      - FnCall: ['vmlal_high_{neon_type[2]}', [a, b, {FnCall: ['vdupq_lane{neon_type[2].no}', [c], [LANE]]}]]

  - name: "vrsrad_n_u64"
    doc: "Unsigned rounding shift right and accumulate."